bitkeeper revision 1.1236.43.15 (42446ea83i0TVEFNdNTE8D6WBPWfaQ)
author: kaf24@firebug.cl.cam.ac.uk <kaf24@firebug.cl.cam.ac.uk>
Fri, 25 Mar 2005 20:03:52 +0000 (20:03 +0000)
committer: kaf24@firebug.cl.cam.ac.uk <kaf24@firebug.cl.cam.ac.uk>
Fri, 25 Mar 2005 20:03:52 +0000 (20:03 +0000)
Move Linux 2.4 to writable pagetables. It doesn't boot, but that bug
is not caused by this changeset (I see exactly the same behaviour
with these changes backed out). Will need some investigation: first
on 2.0-testing to see if any fixes are needed there...
Signed-off-by: Keir Fraser <keir@xensource.com>
27 files changed:
.rootkeys
linux-2.4.29-xen-sparse/arch/xen/kernel/head.S
linux-2.4.29-xen-sparse/arch/xen/kernel/ldt.c
linux-2.4.29-xen-sparse/arch/xen/kernel/traps.c
linux-2.4.29-xen-sparse/arch/xen/mm/fault.c
linux-2.4.29-xen-sparse/arch/xen/mm/init.c
linux-2.4.29-xen-sparse/fs/exec.c [deleted file]
linux-2.4.29-xen-sparse/include/asm-xen/page.h
linux-2.4.29-xen-sparse/include/asm-xen/pgalloc.h
linux-2.4.29-xen-sparse/include/asm-xen/pgtable-2level.h
linux-2.4.29-xen-sparse/include/asm-xen/pgtable.h
linux-2.4.29-xen-sparse/mm/highmem.c
linux-2.4.29-xen-sparse/mm/memory.c
linux-2.4.29-xen-sparse/mm/mremap.c
linux-2.4.29-xen-sparse/mm/swapfile.c [deleted file]
linux-2.4.29-xen-sparse/mm/vmalloc.c [deleted file]
linux-2.6.11-xen-sparse/arch/xen/Kconfig
linux-2.6.11-xen-sparse/arch/xen/configs/xen0_defconfig
linux-2.6.11-xen-sparse/arch/xen/configs/xenU_defconfig
linux-2.6.11-xen-sparse/arch/xen/i386/kernel/traps.c
linux-2.6.11-xen-sparse/arch/xen/i386/mm/fault.c
linux-2.6.11-xen-sparse/arch/xen/i386/mm/hypervisor.c
linux-2.6.11-xen-sparse/arch/xen/i386/mm/pgtable.c
linux-2.6.11-xen-sparse/arch/xen/kernel/reboot.c
linux-2.6.11-xen-sparse/include/asm-xen/asm-i386/page.h
linux-2.6.11-xen-sparse/include/asm-xen/asm-i386/pgtable.h
linux-2.6.11-xen-sparse/include/asm-xen/hypervisor.h

index ad520bef1be53cb49eec05bdc112b3ada75049ef..8dca99fafd1b3fd790431086669c32a203e4c1b0 100644 (file)
--- a/.rootkeys
+++ b/.rootkeys
 3e5a4e66rw65CxyolW9PKz4GG42RcA linux-2.4.29-xen-sparse/drivers/char/tty_io.c
 40c9c0c1pPwYE3-4i-oI3ubUu7UgvQ linux-2.4.29-xen-sparse/drivers/scsi/aic7xxx/Makefile
 41f97f64nW0wmgLxhwzPTzkF4E5ERA linux-2.4.29-xen-sparse/drivers/usb/hcd.c
-3e5a4e669uzIE54VwucPYtGwXLAbzA linux-2.4.29-xen-sparse/fs/exec.c
 3e5a4e66wbeCpsJgVf_U8Jde-CNcsA linux-2.4.29-xen-sparse/include/asm-xen/bugs.h
 3e5a4e66HdSkvIV6SJ1evG_xmTmXHA linux-2.4.29-xen-sparse/include/asm-xen/desc.h
 3e5a4e66SYp_UpAVcF8Lc1wa3Qtgzw linux-2.4.29-xen-sparse/include/asm-xen/fixmap.h
 3f108af5VxPkLv13tXpXgoRKALQtXQ linux-2.4.29-xen-sparse/mm/mprotect.c
 3e5a4e681xMPdF9xCMwpyfuYMySU5g linux-2.4.29-xen-sparse/mm/mremap.c
 409ba2e7akOFqQUg6Qyg2s28xcXiMg linux-2.4.29-xen-sparse/mm/page_alloc.c
-3e5a4e683HKVU-sxtagrDasRB8eBVw linux-2.4.29-xen-sparse/mm/swapfile.c
-41180721bNns9Na7w1nJ0ZVt8bhUNA linux-2.4.29-xen-sparse/mm/vmalloc.c
 41505c57WAd5l1rlfCLNSCpx9J13vA linux-2.4.29-xen-sparse/net/core/skbuff.c
 40f562372u3A7_kfbYYixPHJJxYUxA linux-2.6.11-xen-sparse/arch/xen/Kconfig
 40f56237utH41NPukqHksuNf29IC9A linux-2.6.11-xen-sparse/arch/xen/Kconfig.drivers
index cda41ae56ca7e012ff0324315618476dcf88552e..c856a0bd29fa00d4f2add5d3ba2d52410be15d52 100644 (file)
@@ -1,6 +1,9 @@
 
 .section __xen_guest
-    .asciz "GUEST_OS=linux,GUEST_VER=2.4,XEN_VER=3.0,VIRT_BASE=0xC0000000"
+    .ascii "GUEST_OS=linux,GUEST_VER=2.4,XEN_VER=3.0,VIRT_BASE=0xC0000000"
+    .ascii ",LOADER=generic"
+    .ascii ",PT_MODE_WRITABLE"
+    .byte  0
 
 .text
 #include <linux/config.h>
index 374c9b6c30b48212148911cd7e3c4b25f67849d8..61fc1eb8247e9bd16a5123f3e1275494f16d6ccc 100644 (file)
@@ -84,6 +84,7 @@ static inline int copy_ldt(mm_context_t *new, mm_context_t *old)
        }
        memcpy(new->ldt, old->ldt, old->size*LDT_ENTRY_SIZE);
        make_pages_readonly(new->ldt, (new->size*LDT_ENTRY_SIZE)/PAGE_SIZE);
+       flush_page_update_queue();
        return 0;
 }
 
index ada06dd973f6bbd512e2cc0eb4818857f4d2cdda..f593714e02c3be97390a90f42fce81841d208769 100644 (file)
@@ -623,6 +623,7 @@ void __init trap_init(void)
     set_call_gate(&default_ldt[0],lcall7);
     set_call_gate(&default_ldt[4],lcall27);
     __make_page_readonly(&default_ldt[0]);
+    flush_page_update_queue();
 
     cpu_init();
 }
index d19218fe3237b16a913356a9b80d1a0716cb441e..49a0afc887c553d07d70b2091767ebe6a60d43c8 100644 (file)
@@ -296,7 +296,6 @@ vmalloc_fault:
                if (!pmd_present(*pmd_k))
                        goto no_context;
                set_pmd(pmd, *pmd_k);
-                XEN_flush_page_update_queue(); /* flush PMD update */
 
                pte_k = pte_offset(pmd_k, address);
                if (!pte_present(*pte_k))
index 40a5af9273b9ab027abf3dc37c7369712907461d..88d775bcd409e039b5e5be41499bd7e5664a37dc 100644 (file)
@@ -142,7 +142,7 @@ static inline void set_pte_phys (unsigned long vaddr,
     }
     pte = pte_offset(pmd, vaddr);
 
-    queue_l1_entry_update(pte, phys | pgprot_val(prot));
+    set_pte(pte, (pte_t) { phys | pgprot_val(prot) });
 
     /*
      * It's enough to flush this one mapping.
@@ -201,17 +201,13 @@ static void __init fixrange_init (unsigned long start,
                 kpgd = pgd_offset_k((unsigned long)pte);
                 kpmd = pmd_offset(kpgd, (unsigned long)pte);
                 kpte = pte_offset(kpmd, (unsigned long)pte);
-                queue_l1_entry_update(kpte,
-                                      (*(unsigned long *)kpte)&~_PAGE_RW);
-
+                set_pte(kpte, pte_wrprotect(*kpte));
                 set_pmd(pmd, __pmd(_KERNPG_TABLE + __pa(pte)));
             }
             vaddr += PMD_SIZE;
         }
         j = 0;
     }
-       
-    XEN_flush_page_update_queue();
 }
 
 
@@ -257,10 +253,8 @@ static void __init pagetable_init (void)
             kpgd = pgd_offset_k((unsigned long)pte_base);
             kpmd = pmd_offset(kpgd, (unsigned long)pte_base);
             kpte = pte_offset(kpmd, (unsigned long)pte_base);
-            queue_l1_entry_update(kpte,
-                                  (*(unsigned long *)kpte)&~_PAGE_RW);
+            set_pte(kpte, pte_wrprotect(*kpte));
             set_pmd(pmd, __pmd(_KERNPG_TABLE + __pa(pte_base)));
-            XEN_flush_page_update_queue();
         }
     }
 
@@ -311,6 +305,7 @@ void __init paging_init(void)
     pagetable_init();
 
     zone_sizes_init();
+
     /* Switch to the real shared_info page, and clear the dummy page. */
     set_fixmap(FIX_SHARED_INFO, xen_start_info.shared_info);
     HYPERVISOR_shared_info = (shared_info_t *)fix_to_virt(FIX_SHARED_INFO);
diff --git a/linux-2.4.29-xen-sparse/fs/exec.c b/linux-2.4.29-xen-sparse/fs/exec.c
deleted file mode 100644 (file)
index 8a11415..0000000
+++ /dev/null
@@ -1,1179 +0,0 @@
-/*
- *  linux/fs/exec.c
- *
- *  Copyright (C) 1991, 1992  Linus Torvalds
- */
-
-/*
- * #!-checking implemented by tytso.
- */
-/*
- * Demand-loading implemented 01.12.91 - no need to read anything but
- * the header into memory. The inode of the executable is put into
- * "current->executable", and page faults do the actual loading. Clean.
- *
- * Once more I can proudly say that linux stood up to being changed: it
- * was less than 2 hours work to get demand-loading completely implemented.
- *
- * Demand loading changed July 1993 by Eric Youngdale.   Use mmap instead,
- * current->executable is only used by the procfs.  This allows a dispatch
- * table to check for several different types  of binary formats.  We keep
- * trying until we recognize the file or we run out of supported binary
- * formats. 
- */
-
-#include <linux/config.h>
-#include <linux/slab.h>
-#include <linux/file.h>
-#include <linux/mman.h>
-#include <linux/a.out.h>
-#include <linux/stat.h>
-#include <linux/fcntl.h>
-#include <linux/smp_lock.h>
-#include <linux/init.h>
-#include <linux/pagemap.h>
-#include <linux/highmem.h>
-#include <linux/spinlock.h>
-#include <linux/personality.h>
-#include <linux/swap.h>
-#include <linux/utsname.h>
-#define __NO_VERSION__
-#include <linux/module.h>
-
-#include <asm/uaccess.h>
-#include <asm/pgalloc.h>
-#include <asm/mmu_context.h>
-
-#ifdef CONFIG_KMOD
-#include <linux/kmod.h>
-#endif
-
-int core_uses_pid;
-char core_pattern[65] = "core";
-int core_setuid_ok = 0;
-/* The maximal length of core_pattern is also specified in sysctl.c */ 
-
-static struct linux_binfmt *formats;
-static rwlock_t binfmt_lock = RW_LOCK_UNLOCKED;
-
-int register_binfmt(struct linux_binfmt * fmt)
-{
-       struct linux_binfmt ** tmp = &formats;
-
-       if (!fmt)
-               return -EINVAL;
-       if (fmt->next)
-               return -EBUSY;
-       write_lock(&binfmt_lock);
-       while (*tmp) {
-               if (fmt == *tmp) {
-                       write_unlock(&binfmt_lock);
-                       return -EBUSY;
-               }
-               tmp = &(*tmp)->next;
-       }
-       fmt->next = formats;
-       formats = fmt;
-       write_unlock(&binfmt_lock);
-       return 0;       
-}
-
-int unregister_binfmt(struct linux_binfmt * fmt)
-{
-       struct linux_binfmt ** tmp = &formats;
-
-       write_lock(&binfmt_lock);
-       while (*tmp) {
-               if (fmt == *tmp) {
-                       *tmp = fmt->next;
-                       write_unlock(&binfmt_lock);
-                       return 0;
-               }
-               tmp = &(*tmp)->next;
-       }
-       write_unlock(&binfmt_lock);
-       return -EINVAL;
-}
-
-static inline void put_binfmt(struct linux_binfmt * fmt)
-{
-       if (fmt->module)
-               __MOD_DEC_USE_COUNT(fmt->module);
-}
-
-/*
- * Note that a shared library must be both readable and executable due to
- * security reasons.
- *
- * Also note that we take the address to load from from the file itself.
- */
-asmlinkage long sys_uselib(const char * library)
-{
-       struct file * file;
-       struct nameidata nd;
-       int error;
-
-       error = user_path_walk(library, &nd);
-       if (error)
-               goto out;
-
-       error = -EINVAL;
-       if (!S_ISREG(nd.dentry->d_inode->i_mode))
-               goto exit;
-
-       error = permission(nd.dentry->d_inode, MAY_READ | MAY_EXEC);
-       if (error)
-               goto exit;
-
-       file = dentry_open(nd.dentry, nd.mnt, O_RDONLY);
-       error = PTR_ERR(file);
-       if (IS_ERR(file))
-               goto out;
-
-       error = -ENOEXEC;
-       if(file->f_op && file->f_op->read) {
-               struct linux_binfmt * fmt;
-
-               read_lock(&binfmt_lock);
-               for (fmt = formats ; fmt ; fmt = fmt->next) {
-                       if (!fmt->load_shlib)
-                               continue;
-                       if (!try_inc_mod_count(fmt->module))
-                               continue;
-                       read_unlock(&binfmt_lock);
-                       error = fmt->load_shlib(file);
-                       read_lock(&binfmt_lock);
-                       put_binfmt(fmt);
-                       if (error != -ENOEXEC)
-                               break;
-               }
-               read_unlock(&binfmt_lock);
-       }
-       fput(file);
-out:
-       return error;
-exit:
-       path_release(&nd);
-       goto out;
-}
-
-/*
- * count() counts the number of arguments/envelopes
- */
-static int count(char ** argv, int max)
-{
-       int i = 0;
-
-       if (argv != NULL) {
-               for (;;) {
-                       char * p;
-
-                       if (get_user(p, argv))
-                               return -EFAULT;
-                       if (!p)
-                               break;
-                       argv++;
-                       if(++i > max)
-                               return -E2BIG;
-               }
-       }
-       return i;
-}
-
-/*
- * 'copy_strings()' copies argument/envelope strings from user
- * memory to free pages in kernel mem. These are in a format ready
- * to be put directly into the top of new user memory.
- */
-int copy_strings(int argc,char ** argv, struct linux_binprm *bprm) 
-{
-       struct page *kmapped_page = NULL;
-       char *kaddr = NULL;
-       int ret;
-
-       while (argc-- > 0) {
-               char *str;
-               int len;
-               unsigned long pos;
-
-               if (get_user(str, argv+argc) ||
-                               !(len = strnlen_user(str, bprm->p))) {
-                       ret = -EFAULT;
-                       goto out;
-               }
-
-               if (bprm->p < len)  {
-                       ret = -E2BIG;
-                       goto out;
-               }
-
-               bprm->p -= len;
-               /* XXX: add architecture specific overflow check here. */ 
-               pos = bprm->p;
-
-               while (len > 0) {
-                       int i, new, err;
-                       int offset, bytes_to_copy;
-                       struct page *page;
-
-                       offset = pos % PAGE_SIZE;
-                       i = pos/PAGE_SIZE;
-                       page = bprm->page[i];
-                       new = 0;
-                       if (!page) {
-                               page = alloc_page(GFP_HIGHUSER);
-                               bprm->page[i] = page;
-                               if (!page) {
-                                       ret = -ENOMEM;
-                                       goto out;
-                               }
-                               new = 1;
-                       }
-
-                       if (page != kmapped_page) {
-                               if (kmapped_page)
-                                       kunmap(kmapped_page);
-                               kmapped_page = page;
-                               kaddr = kmap(kmapped_page);
-                       }
-                       if (new && offset)
-                               memset(kaddr, 0, offset);
-                       bytes_to_copy = PAGE_SIZE - offset;
-                       if (bytes_to_copy > len) {
-                               bytes_to_copy = len;
-                               if (new)
-                                       memset(kaddr+offset+len, 0,
-                                               PAGE_SIZE-offset-len);
-                       }
-                       err = copy_from_user(kaddr+offset, str, bytes_to_copy);
-                       if (err) {
-                               ret = -EFAULT;
-                               goto out;
-                       }
-
-                       pos += bytes_to_copy;
-                       str += bytes_to_copy;
-                       len -= bytes_to_copy;
-               }
-       }
-       ret = 0;
-out:
-       if (kmapped_page)
-               kunmap(kmapped_page);
-       return ret;
-}
-
-/*
- * Like copy_strings, but get argv and its values from kernel memory.
- */
-int copy_strings_kernel(int argc,char ** argv, struct linux_binprm *bprm)
-{
-       int r;
-       mm_segment_t oldfs = get_fs();
-       set_fs(KERNEL_DS); 
-       r = copy_strings(argc, argv, bprm);
-       set_fs(oldfs);
-       return r; 
-}
-
-/*
- * This routine is used to map in a page into an address space: needed by
- * execve() for the initial stack and environment pages.
- *
- * tsk->mmap_sem is held for writing.
- */
-void put_dirty_page(struct task_struct * tsk, struct page *page, unsigned long address)
-{
-       pgd_t * pgd;
-       pmd_t * pmd;
-       pte_t * pte;
-       struct vm_area_struct *vma; 
-       pgprot_t prot = PAGE_COPY; 
-
-       if (page_count(page) != 1)
-               printk(KERN_ERR "mem_map disagrees with %p at %08lx\n", page, address);
-       pgd = pgd_offset(tsk->mm, address);
-
-       spin_lock(&tsk->mm->page_table_lock);
-       pmd = pmd_alloc(tsk->mm, pgd, address);
-       if (!pmd)
-               goto out;
-       pte = pte_alloc(tsk->mm, pmd, address);
-       if (!pte)
-               goto out;
-       if (!pte_none(*pte))
-               goto out;
-       lru_cache_add(page);
-       flush_dcache_page(page);
-       flush_page_to_ram(page);
-       /* lookup is cheap because there is only a single entry in the list */
-       vma = find_vma(tsk->mm, address);
-       if (vma)
-               prot = vma->vm_page_prot;
-       set_pte(pte, pte_mkdirty(pte_mkwrite(mk_pte(page, prot))));
-       XEN_flush_page_update_queue();
-       tsk->mm->rss++;
-       spin_unlock(&tsk->mm->page_table_lock);
-
-       /* no need for flush_tlb */
-       return;
-out:
-       spin_unlock(&tsk->mm->page_table_lock);
-       __free_page(page);
-       force_sig(SIGKILL, tsk);
-       return;
-}
-
-int setup_arg_pages(struct linux_binprm *bprm)
-{
-       unsigned long stack_base;
-       struct vm_area_struct *mpnt;
-       int i, ret;
-
-       stack_base = STACK_TOP - MAX_ARG_PAGES*PAGE_SIZE;
-
-       bprm->p += stack_base;
-       if (bprm->loader)
-               bprm->loader += stack_base;
-       bprm->exec += stack_base;
-
-       mpnt = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
-       if (!mpnt) 
-               return -ENOMEM; 
-       
-       down_write(&current->mm->mmap_sem);
-       {
-               mpnt->vm_mm = current->mm;
-               mpnt->vm_start = PAGE_MASK & (unsigned long) bprm->p;
-               mpnt->vm_end = STACK_TOP;
-               mpnt->vm_flags = VM_STACK_FLAGS;
-               mpnt->vm_page_prot = protection_map[VM_STACK_FLAGS & 0x7];
-               mpnt->vm_ops = NULL;
-               mpnt->vm_pgoff = 0;
-               mpnt->vm_file = NULL;
-               mpnt->vm_private_data = (void *) 0;
-               if ((ret = insert_vm_struct(current->mm, mpnt))) {
-                       up_write(&current->mm->mmap_sem);
-                       kmem_cache_free(vm_area_cachep, mpnt);
-                       return ret;
-               }
-               current->mm->total_vm = (mpnt->vm_end - mpnt->vm_start) >> PAGE_SHIFT;
-       } 
-
-       for (i = 0 ; i < MAX_ARG_PAGES ; i++) {
-               struct page *page = bprm->page[i];
-               if (page) {
-                       bprm->page[i] = NULL;
-                       put_dirty_page(current,page,stack_base);
-               }
-               stack_base += PAGE_SIZE;
-       }
-       up_write(&current->mm->mmap_sem);
-       
-       return 0;
-}
-
-struct file *open_exec(const char *name)
-{
-       struct nameidata nd;
-       struct inode *inode;
-       struct file *file;
-       int err = 0;
-
-       err = path_lookup(name, LOOKUP_FOLLOW|LOOKUP_POSITIVE, &nd);
-       file = ERR_PTR(err);
-       if (!err) {
-               inode = nd.dentry->d_inode;
-               file = ERR_PTR(-EACCES);
-               if (!(nd.mnt->mnt_flags & MNT_NOEXEC) &&
-                   S_ISREG(inode->i_mode)) {
-                       int err = permission(inode, MAY_EXEC);
-                       if (!err && !(inode->i_mode & 0111))
-                               err = -EACCES;
-                       file = ERR_PTR(err);
-                       if (!err) {
-                               file = dentry_open(nd.dentry, nd.mnt, O_RDONLY);
-                               if (!IS_ERR(file)) {
-                                       err = deny_write_access(file);
-                                       if (err) {
-                                               fput(file);
-                                               file = ERR_PTR(err);
-                                       }
-                               }
-out:
-                               return file;
-                       }
-               }
-               path_release(&nd);
-       }
-       goto out;
-}
-
-int kernel_read(struct file *file, unsigned long offset,
-       char * addr, unsigned long count)
-{
-       mm_segment_t old_fs;
-       loff_t pos = offset;
-       int result = -ENOSYS;
-
-       if (!file->f_op->read)
-               goto fail;
-       old_fs = get_fs();
-       set_fs(get_ds());
-       result = file->f_op->read(file, addr, count, &pos);
-       set_fs(old_fs);
-fail:
-       return result;
-}
-
-static int exec_mmap(void)
-{
-       struct mm_struct * mm, * old_mm;
-
-       old_mm = current->mm;
-
-       if (old_mm && atomic_read(&old_mm->mm_users) == 1) {
-               mm_release();
-               down_write(&old_mm->mmap_sem);
-               exit_mmap(old_mm);
-               up_write(&old_mm->mmap_sem);
-               return 0;
-       }
-
-
-       mm = mm_alloc();
-       if (mm) {
-               struct mm_struct *active_mm;
-
-               if (init_new_context(current, mm)) {
-                       mmdrop(mm);
-                       return -ENOMEM;
-               }
-
-               /* Add it to the list of mm's */
-               spin_lock(&mmlist_lock);
-               list_add(&mm->mmlist, &init_mm.mmlist);
-               mmlist_nr++;
-               spin_unlock(&mmlist_lock);
-
-               task_lock(current);
-               active_mm = current->active_mm;
-               current->mm = mm;
-               current->active_mm = mm;
-               task_unlock(current);
-               activate_mm(active_mm, mm);
-               mm_release();
-               if (old_mm) {
-                       if (active_mm != old_mm) BUG();
-                       mmput(old_mm);
-                       return 0;
-               }
-               mmdrop(active_mm);
-               return 0;
-       }
-       return -ENOMEM;
-}
-
-/*
- * This function makes sure the current process has its own signal table,
- * so that flush_signal_handlers can later reset the handlers without
- * disturbing other processes.  (Other processes might share the signal
- * table via the CLONE_SIGNAL option to clone().)
- */
-static inline int make_private_signals(void)
-{
-       struct signal_struct * newsig;
-
-       if (atomic_read(&current->sig->count) <= 1)
-               return 0;
-       newsig = kmem_cache_alloc(sigact_cachep, GFP_KERNEL);
-       if (newsig == NULL)
-               return -ENOMEM;
-       spin_lock_init(&newsig->siglock);
-       atomic_set(&newsig->count, 1);
-       memcpy(newsig->action, current->sig->action, sizeof(newsig->action));
-       spin_lock_irq(&current->sigmask_lock);
-       current->sig = newsig;
-       spin_unlock_irq(&current->sigmask_lock);
-       return 0;
-}
-       
-/*
- * If make_private_signals() made a copy of the signal table, decrement the
- * refcount of the original table, and free it if necessary.
- * We don't do that in make_private_signals() so that we can back off
- * in flush_old_exec() if an error occurs after calling make_private_signals().
- */
-
-static inline void release_old_signals(struct signal_struct * oldsig)
-{
-       if (current->sig == oldsig)
-               return;
-       if (atomic_dec_and_test(&oldsig->count))
-               kmem_cache_free(sigact_cachep, oldsig);
-}
-
-/*
- * These functions flushes out all traces of the currently running executable
- * so that a new one can be started
- */
-
-static inline void flush_old_files(struct files_struct * files)
-{
-       long j = -1;
-
-       write_lock(&files->file_lock);
-       for (;;) {
-               unsigned long set, i;
-
-               j++;
-               i = j * __NFDBITS;
-               if (i >= files->max_fds || i >= files->max_fdset)
-                       break;
-               set = files->close_on_exec->fds_bits[j];
-               if (!set)
-                       continue;
-               files->close_on_exec->fds_bits[j] = 0;
-               write_unlock(&files->file_lock);
-               for ( ; set ; i++,set >>= 1) {
-                       if (set & 1) {
-                               sys_close(i);
-                       }
-               }
-               write_lock(&files->file_lock);
-
-       }
-       write_unlock(&files->file_lock);
-}
-
-/*
- * An execve() will automatically "de-thread" the process.
- * Note: we don't have to hold the tasklist_lock to test
- * whether we migth need to do this. If we're not part of
- * a thread group, there is no way we can become one
- * dynamically. And if we are, we only need to protect the
- * unlink - even if we race with the last other thread exit,
- * at worst the list_del_init() might end up being a no-op.
- */
-static inline void de_thread(struct task_struct *tsk)
-{
-       if (!list_empty(&tsk->thread_group)) {
-               write_lock_irq(&tasklist_lock);
-               list_del_init(&tsk->thread_group);
-               write_unlock_irq(&tasklist_lock);
-       }
-
-       /* Minor oddity: this might stay the same. */
-       tsk->tgid = tsk->pid;
-}
-
-void get_task_comm(char *buf, struct task_struct *tsk)
-{
-       /* buf must be at least sizeof(tsk->comm) in size */
-       task_lock(tsk);
-       memcpy(buf, tsk->comm, sizeof(tsk->comm));
-       task_unlock(tsk);
-}
-
-void set_task_comm(struct task_struct *tsk, char *buf)
-{
-       task_lock(tsk);
-       strncpy(tsk->comm, buf, sizeof(tsk->comm));
-       tsk->comm[sizeof(tsk->comm)-1]='\0';
-       task_unlock(tsk);
-}
-
-int flush_old_exec(struct linux_binprm * bprm)
-{
-       char * name;
-       int i, ch, retval;
-       struct signal_struct * oldsig;
-       struct files_struct * files;
-       char tcomm[sizeof(current->comm)];
-
-       /*
-        * Make sure we have a private signal table
-        */
-       oldsig = current->sig;
-       retval = make_private_signals();
-       if (retval) goto flush_failed;
-
-       /*
-        * Make sure we have private file handles. Ask the
-        * fork helper to do the work for us and the exit
-        * helper to do the cleanup of the old one.
-        */
-        
-       files = current->files;         /* refcounted so safe to hold */
-       retval = unshare_files();
-       if(retval)
-               goto flush_failed;
-       
-       /* 
-        * Release all of the old mmap stuff
-        */
-       retval = exec_mmap();
-       if (retval) goto mmap_failed;
-
-       /* This is the point of no return */
-       steal_locks(files);
-       put_files_struct(files);
-       release_old_signals(oldsig);
-
-       current->sas_ss_sp = current->sas_ss_size = 0;
-
-       if (current->euid == current->uid && current->egid == current->gid) {
-               current->mm->dumpable = 1;
-               current->task_dumpable = 1;
-       }
-       name = bprm->filename;
-       for (i=0; (ch = *(name++)) != '\0';) {
-               if (ch == '/')
-                       i = 0;
-               else
-                       if (i < (sizeof(tcomm) - 1))
-                               tcomm[i++] = ch;
-       }
-       tcomm[i] = '\0';
-       set_task_comm(current, tcomm);
-
-       flush_thread();
-
-       de_thread(current);
-
-       if (bprm->e_uid != current->euid || bprm->e_gid != current->egid || 
-           permission(bprm->file->f_dentry->d_inode,MAY_READ))
-               current->mm->dumpable = 0;
-
-       /* An exec changes our domain. We are no longer part of the thread
-          group */
-          
-       current->self_exec_id++;
-                       
-       flush_signal_handlers(current);
-       flush_old_files(current->files);
-
-       return 0;
-
-mmap_failed:
-       put_files_struct(current->files);
-       current->files = files;
-flush_failed:
-       spin_lock_irq(&current->sigmask_lock);
-       if (current->sig != oldsig) {
-               kmem_cache_free(sigact_cachep, current->sig);
-               current->sig = oldsig;
-       }
-       spin_unlock_irq(&current->sigmask_lock);
-       return retval;
-}
-
-/*
- * We mustn't allow tracing of suid binaries, unless
- * the tracer has the capability to trace anything..
- */
-static inline int must_not_trace_exec(struct task_struct * p)
-{
-       return (p->ptrace & PT_PTRACED) && !(p->ptrace & PT_PTRACE_CAP);
-}
-
-/* 
- * Fill the binprm structure from the inode. 
- * Check permissions, then read the first 128 (BINPRM_BUF_SIZE) bytes
- */
-int prepare_binprm(struct linux_binprm *bprm)
-{
-       int mode;
-       struct inode * inode = bprm->file->f_dentry->d_inode;
-
-       mode = inode->i_mode;
-       /*
-        * Check execute perms again - if the caller has CAP_DAC_OVERRIDE,
-        * vfs_permission lets a non-executable through
-        */
-       if (!(mode & 0111))     /* with at least _one_ execute bit set */
-               return -EACCES;
-       if (bprm->file->f_op == NULL)
-               return -EACCES;
-
-       bprm->e_uid = current->euid;
-       bprm->e_gid = current->egid;
-
-       if(!(bprm->file->f_vfsmnt->mnt_flags & MNT_NOSUID)) {
-               /* Set-uid? */
-               if (mode & S_ISUID)
-                       bprm->e_uid = inode->i_uid;
-
-               /* Set-gid? */
-               /*
-                * If setgid is set but no group execute bit then this
-                * is a candidate for mandatory locking, not a setgid
-                * executable.
-                */
-               if ((mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP))
-                       bprm->e_gid = inode->i_gid;
-       }
-
-       /* We don't have VFS support for capabilities yet */
-       cap_clear(bprm->cap_inheritable);
-       cap_clear(bprm->cap_permitted);
-       cap_clear(bprm->cap_effective);
-
-       /*  To support inheritance of root-permissions and suid-root
-         *  executables under compatibility mode, we raise all three
-         *  capability sets for the file.
-         *
-         *  If only the real uid is 0, we only raise the inheritable
-         *  and permitted sets of the executable file.
-         */
-
-       if (!issecure(SECURE_NOROOT)) {
-               if (bprm->e_uid == 0 || current->uid == 0) {
-                       cap_set_full(bprm->cap_inheritable);
-                       cap_set_full(bprm->cap_permitted);
-               }
-               if (bprm->e_uid == 0) 
-                       cap_set_full(bprm->cap_effective);
-       }
-
-       memset(bprm->buf,0,BINPRM_BUF_SIZE);
-       return kernel_read(bprm->file,0,bprm->buf,BINPRM_BUF_SIZE);
-}
-
-/*
- * This function is used to produce the new IDs and capabilities
- * from the old ones and the file's capabilities.
- *
- * The formula used for evolving capabilities is:
- *
- *       pI' = pI
- * (***) pP' = (fP & X) | (fI & pI)
- *       pE' = pP' & fE          [NB. fE is 0 or ~0]
- *
- * I=Inheritable, P=Permitted, E=Effective // p=process, f=file
- * ' indicates post-exec(), and X is the global 'cap_bset'.
- *
- */
-
-void compute_creds(struct linux_binprm *bprm) 
-{
-       kernel_cap_t new_permitted, working;
-       int do_unlock = 0;
-
-       new_permitted = cap_intersect(bprm->cap_permitted, cap_bset);
-       working = cap_intersect(bprm->cap_inheritable,
-                               current->cap_inheritable);
-       new_permitted = cap_combine(new_permitted, working);
-
-       if (bprm->e_uid != current->uid || bprm->e_gid != current->gid ||
-           !cap_issubset(new_permitted, current->cap_permitted)) {
-                current->mm->dumpable = 0;
-               
-               lock_kernel();
-               if (must_not_trace_exec(current)
-                   || atomic_read(&current->fs->count) > 1
-                   || atomic_read(&current->files->count) > 1
-                   || atomic_read(&current->sig->count) > 1) {
-                       if(!capable(CAP_SETUID)) {
-                               bprm->e_uid = current->uid;
-                               bprm->e_gid = current->gid;
-                       }
-                       if(!capable(CAP_SETPCAP)) {
-                               new_permitted = cap_intersect(new_permitted,
-                                                       current->cap_permitted);
-                       }
-               }
-               do_unlock = 1;
-       }
-
-
-       /* For init, we want to retain the capabilities set
-         * in the init_task struct. Thus we skip the usual
-         * capability rules */
-       if (current->pid != 1) {
-               current->cap_permitted = new_permitted;
-               current->cap_effective =
-                       cap_intersect(new_permitted, bprm->cap_effective);
-       }
-       
-        /* AUD: Audit candidate if current->cap_effective is set */
-
-        current->suid = current->euid = current->fsuid = bprm->e_uid;
-        current->sgid = current->egid = current->fsgid = bprm->e_gid;
-
-       if(do_unlock)
-               unlock_kernel();
-       current->keep_capabilities = 0;
-}
-
-
-void remove_arg_zero(struct linux_binprm *bprm)
-{
-       if (bprm->argc) {
-               unsigned long offset;
-               char * kaddr;
-               struct page *page;
-
-               offset = bprm->p % PAGE_SIZE;
-               goto inside;
-
-               while (bprm->p++, *(kaddr+offset++)) {
-                       if (offset != PAGE_SIZE)
-                               continue;
-                       offset = 0;
-                       kunmap(page);
-inside:
-                       page = bprm->page[bprm->p/PAGE_SIZE];
-                       kaddr = kmap(page);
-               }
-               kunmap(page);
-               bprm->argc--;
-       }
-}
-
-/*
- * cycle the list of binary formats handler, until one recognizes the image
- */
-int search_binary_handler(struct linux_binprm *bprm,struct pt_regs *regs)
-{
-       int try,retval=0;
-       struct linux_binfmt *fmt;
-#ifdef __alpha__
-       /* handle /sbin/loader.. */
-       {
-           struct exec * eh = (struct exec *) bprm->buf;
-
-           if (!bprm->loader && eh->fh.f_magic == 0x183 &&
-               (eh->fh.f_flags & 0x3000) == 0x3000)
-           {
-               struct file * file;
-               unsigned long loader;
-
-               allow_write_access(bprm->file);
-               fput(bprm->file);
-               bprm->file = NULL;
-
-               loader = PAGE_SIZE*MAX_ARG_PAGES-sizeof(void *);
-
-               file = open_exec("/sbin/loader");
-               retval = PTR_ERR(file);
-               if (IS_ERR(file))
-                       return retval;
-
-               /* Remember if the application is TASO.  */
-               bprm->sh_bang = eh->ah.entry < 0x100000000;
-
-               bprm->file = file;
-               bprm->loader = loader;
-               retval = prepare_binprm(bprm);
-               if (retval<0)
-                       return retval;
-               /* should call search_binary_handler recursively here,
-                  but it does not matter */
-           }
-       }
-#endif
-       /* kernel module loader fixup */
-       /* so we don't try to load run modprobe in kernel space. */
-       set_fs(USER_DS);
-       for (try=0; try<2; try++) {
-               read_lock(&binfmt_lock);
-               for (fmt = formats ; fmt ; fmt = fmt->next) {
-                       int (*fn)(struct linux_binprm *, struct pt_regs *) = fmt->load_binary;
-                       if (!fn)
-                               continue;
-                       if (!try_inc_mod_count(fmt->module))
-                               continue;
-                       read_unlock(&binfmt_lock);
-                       retval = fn(bprm, regs);
-                       if (retval >= 0) {
-                               put_binfmt(fmt);
-                               allow_write_access(bprm->file);
-                               if (bprm->file)
-                                       fput(bprm->file);
-                               bprm->file = NULL;
-                               current->did_exec = 1;
-                               return retval;
-                       }
-                       read_lock(&binfmt_lock);
-                       put_binfmt(fmt);
-                       if (retval != -ENOEXEC)
-                               break;
-                       if (!bprm->file) {
-                               read_unlock(&binfmt_lock);
-                               return retval;
-                       }
-               }
-               read_unlock(&binfmt_lock);
-               if (retval != -ENOEXEC) {
-                       break;
-#ifdef CONFIG_KMOD
-               }else{
-#define printable(c) (((c)=='\t') || ((c)=='\n') || (0x20<=(c) && (c)<=0x7e))
-                       char modname[20];
-                       if (printable(bprm->buf[0]) &&
-                           printable(bprm->buf[1]) &&
-                           printable(bprm->buf[2]) &&
-                           printable(bprm->buf[3]))
-                               break; /* -ENOEXEC */
-                       sprintf(modname, "binfmt-%04x", *(unsigned short *)(&bprm->buf[2]));
-                       request_module(modname);
-#endif
-               }
-       }
-       return retval;
-}
-
-
-/*
- * sys_execve() executes a new program.
- */
-int do_execve(char * filename, char ** argv, char ** envp, struct pt_regs * regs)
-{
-       struct linux_binprm bprm;
-       struct file *file;
-       int retval;
-       int i;
-
-       file = open_exec(filename);
-
-       retval = PTR_ERR(file);
-       if (IS_ERR(file))
-               return retval;
-
-       bprm.p = PAGE_SIZE*MAX_ARG_PAGES-sizeof(void *);
-       memset(bprm.page, 0, MAX_ARG_PAGES*sizeof(bprm.page[0])); 
-
-       bprm.file = file;
-       bprm.filename = filename;
-       bprm.sh_bang = 0;
-       bprm.loader = 0;
-       bprm.exec = 0;
-       if ((bprm.argc = count(argv, bprm.p / sizeof(void *))) < 0) {
-               allow_write_access(file);
-               fput(file);
-               return bprm.argc;
-       }
-
-       if ((bprm.envc = count(envp, bprm.p / sizeof(void *))) < 0) {
-               allow_write_access(file);
-               fput(file);
-               return bprm.envc;
-       }
-
-       retval = prepare_binprm(&bprm);
-       if (retval < 0) 
-               goto out; 
-
-       retval = copy_strings_kernel(1, &bprm.filename, &bprm);
-       if (retval < 0) 
-               goto out; 
-
-       bprm.exec = bprm.p;
-       retval = copy_strings(bprm.envc, envp, &bprm);
-       if (retval < 0) 
-               goto out; 
-
-       retval = copy_strings(bprm.argc, argv, &bprm);
-       if (retval < 0) 
-               goto out; 
-
-       retval = search_binary_handler(&bprm,regs);
-       if (retval >= 0)
-               /* execve success */
-               return retval;
-
-out:
-       /* Something went wrong, return the inode and free the argument pages*/
-       allow_write_access(bprm.file);
-       if (bprm.file)
-               fput(bprm.file);
-
-       for (i = 0 ; i < MAX_ARG_PAGES ; i++) {
-               struct page * page = bprm.page[i];
-               if (page)
-                       __free_page(page);
-       }
-
-       return retval;
-}
-
-void set_binfmt(struct linux_binfmt *new)
-{
-       struct linux_binfmt *old = current->binfmt;
-       if (new && new->module)
-               __MOD_INC_USE_COUNT(new->module);
-       current->binfmt = new;
-       if (old && old->module)
-               __MOD_DEC_USE_COUNT(old->module);
-}
-
-#define CORENAME_MAX_SIZE 64
-
-/* format_corename will inspect the pattern parameter, and output a
- * name into corename, which must have space for at least
- * CORENAME_MAX_SIZE bytes plus one byte for the zero terminator.
- */
-void format_corename(char *corename, const char *pattern, long signr)
-{
-       const char *pat_ptr = pattern;
-       char *out_ptr = corename;
-       char *const out_end = corename + CORENAME_MAX_SIZE;
-       int rc;
-       int pid_in_pattern = 0;
-
-       /* Repeat as long as we have more pattern to process and more output
-          space */
-       while (*pat_ptr) {
-               if (*pat_ptr != '%') {
-                       if (out_ptr == out_end)
-                               goto out;
-                       *out_ptr++ = *pat_ptr++;
-               } else {
-                       switch (*++pat_ptr) {
-                       case 0:
-                               goto out;
-                       /* Double percent, output one percent */
-                       case '%':
-                               if (out_ptr == out_end)
-                                       goto out;
-                               *out_ptr++ = '%';
-                               break;
-                       /* pid */
-                       case 'p':
-                               pid_in_pattern = 1;
-                               rc = snprintf(out_ptr, out_end - out_ptr,
-                                             "%d", current->pid);
-                               if (rc > out_end - out_ptr)
-                                       goto out;
-                               out_ptr += rc;
-                               break;
-                       /* uid */
-                       case 'u':
-                               rc = snprintf(out_ptr, out_end - out_ptr,
-                                             "%d", current->uid);
-                               if (rc > out_end - out_ptr)
-                                       goto out;
-                               out_ptr += rc;
-                               break;
-                       /* gid */
-                       case 'g':
-                               rc = snprintf(out_ptr, out_end - out_ptr,
-                                             "%d", current->gid);
-                               if (rc > out_end - out_ptr)
-                                       goto out;
-                               out_ptr += rc;
-                               break;
-                       /* signal that caused the coredump */
-                       case 's':
-                               rc = snprintf(out_ptr, out_end - out_ptr,
-                                             "%ld", signr);
-                               if (rc > out_end - out_ptr)
-                                       goto out;
-                               out_ptr += rc;
-                               break;
-                       /* UNIX time of coredump */
-                       case 't': {
-                               struct timeval tv;
-                               do_gettimeofday(&tv);
-                               rc = snprintf(out_ptr, out_end - out_ptr,
-                                             "%ld", tv.tv_sec);
-                               if (rc > out_end - out_ptr)
-                                       goto out;
-                               out_ptr += rc;
-                               break;
-                       }
-                       /* hostname */
-                       case 'h':
-                               down_read(&uts_sem);
-                               rc = snprintf(out_ptr, out_end - out_ptr,
-                                             "%s", system_utsname.nodename);
-                               up_read(&uts_sem);
-                               if (rc > out_end - out_ptr)
-                                       goto out;
-                               out_ptr += rc;
-                               break;
-                       /* executable */
-                       case 'e':
-                               rc = snprintf(out_ptr, out_end - out_ptr,
-                                             "%s", current->comm);
-                               if (rc > out_end - out_ptr)
-                                       goto out;
-                               out_ptr += rc;
-                               break;
-                       default:
-                               break;
-                       }
-                       ++pat_ptr;
-               }
-       }
-       /* Backward compatibility with core_uses_pid:
-        *
-        * If core_pattern does not include a %p (as is the default)
-        * and core_uses_pid is set, then .%pid will be appended to
-        * the filename */
-       if (!pid_in_pattern
-            && (core_uses_pid || atomic_read(&current->mm->mm_users) != 1)) {
-               rc = snprintf(out_ptr, out_end - out_ptr,
-                             ".%d", current->pid);
-               if (rc > out_end - out_ptr)
-                       goto out;
-               out_ptr += rc;
-       }
-      out:
-       *out_ptr = 0;
-}
-
-int do_coredump(long signr, struct pt_regs * regs)
-{
-       struct linux_binfmt * binfmt;
-       char corename[CORENAME_MAX_SIZE + 1];
-       struct file * file;
-       struct inode * inode;
-       int retval = 0;
-       int fsuid = current->fsuid;
-
-       lock_kernel();
-       binfmt = current->binfmt;
-       if (!binfmt || !binfmt->core_dump)
-               goto fail;
-       if (!is_dumpable(current))
-       {
-               if(!core_setuid_ok || !current->task_dumpable)
-                       goto fail;
-               current->fsuid = 0;
-       }
-       current->mm->dumpable = 0;
-       if (current->rlim[RLIMIT_CORE].rlim_cur < binfmt->min_coredump)
-               goto fail;
-
-       format_corename(corename, core_pattern, signr);
-       file = filp_open(corename, O_CREAT | 2 | O_NOFOLLOW, 0600);
-       if (IS_ERR(file))
-               goto fail;
-       inode = file->f_dentry->d_inode;
-       if (inode->i_nlink > 1)
-               goto close_fail;        /* multiple links - don't dump */
-       if (d_unhashed(file->f_dentry))
-               goto close_fail;
-
-       if (!S_ISREG(inode->i_mode))
-               goto close_fail;
-       if (!file->f_op)
-               goto close_fail;
-       if (!file->f_op->write)
-               goto close_fail;
-       if (do_truncate(file->f_dentry, 0) != 0)
-               goto close_fail;
-
-       retval = binfmt->core_dump(signr, regs, file);
-
-close_fail:
-       filp_close(file, NULL);
-fail:
-       if (fsuid != current->fsuid)
-               current->fsuid = fsuid;
-       unlock_kernel();
-       return retval;
-}
index fbab7f5ff1b926c207361fbff9db574e003c2e90..3150545429f83c970e06102df4bfbb26ffd91e70 100644 (file)
@@ -85,23 +85,18 @@ typedef struct { unsigned long pgprot; } pgprot_t;
 static inline unsigned long pmd_val(pmd_t x)
 {
     unsigned long ret = x.pmd;
-    if ( (ret & 1) ) ret = machine_to_phys(ret);
+    if ( ret ) ret = machine_to_phys(ret) | 1;
     return ret;
 }
 #define pmd_val_ma(x)   ((x).pmd)
 #define pgd_val(x)     ({ BUG(); (unsigned long)0; })
 #define pgprot_val(x)  ((x).pgprot)
 
-static inline pte_t __pte(unsigned long x)
-{
-    if ( (x & 1) ) x = phys_to_machine(x);
-    return ((pte_t) { (x) });
-}
-static inline pmd_t __pmd(unsigned long x)
-{
-    if ( (x & 1) ) x = phys_to_machine(x);
-    return ((pmd_t) { (x) });
-}
+#define __pte(x) ({ unsigned long _x = (x); \
+    (((_x)&1) ? ((pte_t) {phys_to_machine(_x)}) : ((pte_t) {(_x)})); })
+#define __pte_ma(x)     ((pte_t) { (x) } )
+#define __pmd(x) ({ unsigned long _x = (x); \
+    (((_x)&1) ? ((pmd_t) {phys_to_machine(_x)}) : ((pmd_t) {(_x)})); })
 #define __pgd(x) ({ BUG(); (pgprot_t) { 0 }; })
 #define __pgprot(x)    ((pgprot_t) { (x) } )
 
index 4e9584e918cc06efed77441dc64b696f6c51ec83..2a0c226c71f6d221730cc83108583e9a96ec93b3 100644 (file)
@@ -22,7 +22,6 @@
 #define pmd_populate(mm, pmd, pte)               \
  do {                                             \
   set_pmd(pmd, __pmd(_PAGE_TABLE + __pa(pte)));   \
-  XEN_flush_page_update_queue();                 \
  } while ( 0 )
 
 /*
@@ -79,8 +78,9 @@ static inline pgd_t *get_pgd_slow(void)
                memcpy(pgd + USER_PTRS_PER_PGD,
                        init_mm.pgd + USER_PTRS_PER_PGD,
                        (PTRS_PER_PGD - USER_PTRS_PER_PGD) * sizeof(pgd_t));
-                __make_page_readonly(pgd);
+               __make_page_readonly(pgd);
                queue_pgd_pin(__pa(pgd));
+               flush_page_update_queue();
        }
        return pgd;
 }
@@ -111,7 +111,8 @@ static inline void free_pgd_slow(pgd_t *pgd)
        kmem_cache_free(pae_pgd_cachep, pgd);
 #else
        queue_pgd_unpin(__pa(pgd));
-        __make_page_writable(pgd);
+       __make_page_writable(pgd);
+       flush_page_update_queue();
        free_page((unsigned long)pgd);
 #endif
 }
@@ -135,6 +136,7 @@ static inline pte_t *pte_alloc_one(struct mm_struct *mm, unsigned long address)
         clear_page(pte);
         __make_page_readonly(pte);
         queue_pte_pin(__pa(pte));
+        flush_page_update_queue();
     }
     return pte;
 
@@ -155,6 +157,7 @@ static __inline__ void pte_free_slow(pte_t *pte)
 {
     queue_pte_unpin(__pa(pte));
     __make_page_writable(pte);
+    flush_page_update_queue();
     free_page((unsigned long)pte);
 }
 
@@ -208,22 +211,19 @@ extern int do_check_pgt_cache(int, int);
 
 static inline void flush_tlb_mm(struct mm_struct *mm)
 {
-       if (mm == current->active_mm) queue_tlb_flush();
-       XEN_flush_page_update_queue();
+       if (mm == current->active_mm) xen_tlb_flush();
 }
 
 static inline void flush_tlb_page(struct vm_area_struct *vma,
        unsigned long addr)
 {
-       if (vma->vm_mm == current->active_mm) queue_invlpg(addr);
-       XEN_flush_page_update_queue();
+       if (vma->vm_mm == current->active_mm) xen_invlpg(addr);
 }
 
 static inline void flush_tlb_range(struct mm_struct *mm,
        unsigned long start, unsigned long end)
 {
-       if (mm == current->active_mm) queue_tlb_flush();
-       XEN_flush_page_update_queue();
+       if (mm == current->active_mm) xen_tlb_flush();
 }
 
 #else
@@ -261,7 +261,6 @@ static inline void flush_tlb_pgtables(struct mm_struct *mm,
                                      unsigned long start, unsigned long end)
 {
     /* i386 does not keep any page table caches in TLB */
-    XEN_flush_page_update_queue();
 }
 
 /*
index d91b48360e2dcc9aaa90f634e03fc3b2b3557603..70f8356fb1dbf169f221c7d813b15a605e009c04 100644 (file)
@@ -34,9 +34,19 @@ static inline int pgd_bad(pgd_t pgd)         { return 0; }
 static inline int pgd_present(pgd_t pgd)       { return 1; }
 #define pgd_clear(xp)                          do { } while (0)
 
-#define set_pte(pteptr, pteval) queue_l1_entry_update(pteptr, (pteval).pte_low)
-#define set_pte_atomic(pteptr, pteval) queue_l1_entry_update(pteptr, (pteval).pte_low)
-#define set_pmd(pmdptr, pmdval) queue_l2_entry_update((pmdptr), (pmdval))
+/*
+ * Certain architectures need to do special things when PTEs
+ * within a page table are directly modified.  Thus, the following
+ * hook is made available.
+ */
+#define set_pte(pteptr, pteval) (*(pteptr) = pteval)
+#define set_pte_atomic(pteptr, pteval) (*(pteptr) = pteval)
+
+/*
+ * (pmds are folded into pgds so this doesnt get actually called,
+ * but the define is needed for a generic inline function.)
+ */
+#define set_pmd(pmdptr, pmdval) xen_l2_entry_update((pmdptr), (pmdval))
 #define set_pgd(pgdptr, pgdval) ((void)0)
 
 #define pgd_page(pgd) \
@@ -47,6 +57,7 @@ static inline pmd_t * pmd_offset(pgd_t * dir, unsigned long address)
        return (pmd_t *) dir;
 }
 
+#define ptep_get_and_clear(xp) __pte_ma(xchg(&(xp)->pte_low, 0))
 #define pte_same(a, b)         ((a).pte_low == (b).pte_low)
 
 /*                                 
@@ -83,21 +94,4 @@ static inline pmd_t * pmd_offset(pgd_t * dir, unsigned long address)
 #define pte_none(x)            (!(x).pte_low)
 #define __mk_pte(page_nr,pgprot) __pte(((page_nr) << PAGE_SHIFT) | pgprot_val(pgprot))
 
-/*
- * A note on implementation of this atomic 'get-and-clear' operation.
- * This is actually very simple because XenoLinux can only run on a single
- * processor. Therefore, we cannot race other processors setting the 'accessed'
- * or 'dirty' bits on a page-table entry.
- * Even if pages are shared between domains, that is not a problem because
- * each domain will have separate page tables, with their own versions of
- * accessed & dirty state.
- */
-static inline pte_t ptep_get_and_clear(pte_t *xp)
-{
-    pte_t pte = *xp;
-    if ( !pte_none(pte) )
-        queue_l1_entry_update(xp, 0);
-    return pte;
-}
-
 #endif /* _I386_PGTABLE_2LEVEL_H */
index c15f0e9509b4e480e980bc0f43e933bafd613392..f5a53adc82c809a5a5e4210bad59db73b159cc92 100644 (file)
@@ -38,11 +38,11 @@ extern void paging_init(void);
 
 extern unsigned long pgkern_mask;
 
-#define __flush_tlb() ({ queue_tlb_flush(); XEN_flush_page_update_queue(); })
+#define __flush_tlb() xen_tlb_flush()
 #define __flush_tlb_global() __flush_tlb()
 #define __flush_tlb_all() __flush_tlb_global()
-#define __flush_tlb_one(addr) ({ queue_invlpg(addr); XEN_flush_page_update_queue(); })
-#define __flush_tlb_single(addr) ({ queue_invlpg(addr); XEN_flush_page_update_queue(); })
+#define __flush_tlb_one(addr) xen_invlpg(addr)
+#define __flush_tlb_single(addr) xen_invlpg(addr)
 
 /*
  * ZERO_PAGE is a global shared page that is always zero: used
@@ -179,12 +179,14 @@ extern void * high_memory;
 #define __S111 PAGE_SHARED
 
 #define pte_present(x) ((x).pte_low & (_PAGE_PRESENT | _PAGE_PROTNONE))
-#define pte_clear(xp)  queue_l1_entry_update(xp, 0)
+#define pte_clear(xp)  do { set_pte(xp, __pte(0)); } while (0)
 
-#define pmd_none(x)    (!(x).pmd)
-#define pmd_present(x) ((x).pmd & _PAGE_PRESENT)
+#define pmd_none(x)    (!pmd_val(x))
+/* pmd_present doesn't just test the _PAGE_PRESENT bit since wr.p.t.
+   can temporarily clear it. */
+#define pmd_present(x) (pmd_val(x))
 #define pmd_clear(xp)  do { set_pmd(xp, __pmd(0)); } while (0)
-#define        pmd_bad(x)      (((x).pmd & (~PAGE_MASK & ~_PAGE_USER)) != _KERNPG_TABLE)
+#define pmd_bad(x)     ((pmd_val(x) & (~PAGE_MASK & ~_PAGE_USER & ~_PAGE_PRESENT)) != (_KERNPG_TABLE & ~_PAGE_PRESENT))
 
 
 #define pages_to_mb(x) ((x) >> (20-PAGE_SHIFT))
@@ -212,29 +214,28 @@ static inline pte_t pte_mkwrite(pte_t pte)        { (pte).pte_low |= _PAGE_RW; return p
 
 static inline int ptep_test_and_clear_dirty(pte_t *ptep)
 {
-    unsigned long pteval = *(unsigned long *)ptep;
-    int ret = pteval & _PAGE_DIRTY;
-    if ( ret ) queue_l1_entry_update(ptep, pteval & ~_PAGE_DIRTY);
-    return ret;
+    if (!pte_dirty(*ptep))
+        return 0;
+    return test_and_clear_bit(_PAGE_BIT_DIRTY, &ptep->pte_low);
 }
-static inline  int ptep_test_and_clear_young(pte_t *ptep)
+
+static inline int ptep_test_and_clear_young(pte_t *ptep)
 {
-    unsigned long pteval = *(unsigned long *)ptep;
-    int ret = pteval & _PAGE_ACCESSED;
-    if ( ret ) queue_l1_entry_update(ptep, pteval & ~_PAGE_ACCESSED);
-    return ret;
+    if (!pte_young(*ptep))
+        return 0;
+    return test_and_clear_bit(_PAGE_BIT_ACCESSED, &ptep->pte_low);
 }
+
 static inline void ptep_set_wrprotect(pte_t *ptep)
 {
-    unsigned long pteval = *(unsigned long *)ptep;
-    if ( (pteval & _PAGE_RW) )
-        queue_l1_entry_update(ptep, pteval & ~_PAGE_RW);
+    if (pte_write(*ptep))
+        clear_bit(_PAGE_BIT_RW, &ptep->pte_low);
 }
+
 static inline void ptep_mkdirty(pte_t *ptep)
 {
-    unsigned long pteval = *(unsigned long *)ptep;
-    if ( !(pteval & _PAGE_DIRTY) )
-        queue_l1_entry_update(ptep, pteval | _PAGE_DIRTY);
+    if (!pte_dirty(*ptep))
+        set_bit(_PAGE_BIT_DIRTY, &ptep->pte_low);
 }
 
 /*
index 341e6e29a901def96cfcdd74cae1f7d5c591c64f..f8182820ac2241f7568415b62d0c7b3e85b04156 100644 (file)
@@ -122,7 +122,6 @@ start:
        }
        vaddr = PKMAP_ADDR(last_pkmap_nr);
        set_pte(&(pkmap_page_table[last_pkmap_nr]), mk_pte(page, kmap_prot));
-       XEN_flush_page_update_queue();
 
        pkmap_count[last_pkmap_nr] = 1;
        page->virtual = (void *) vaddr;
index 7d81c86589c5550448163fc8d5d1f18a782725c3..880b6981c4c87e4700794b22533e8db414eed7ed 100644 (file)
@@ -153,7 +153,6 @@ void clear_page_tables(struct mm_struct *mm, unsigned long first, int nr)
                free_one_pgd(page_dir);
                page_dir++;
        } while (--nr);
-       XEN_flush_page_update_queue();
        spin_unlock(&mm->page_table_lock);
 
        /* keep the page table cache within bounds */
@@ -249,10 +248,8 @@ skip_copy_pte_range:               address = (address + PMD_SIZE) & PMD_MASK;
 
                                /* If it's a COW mapping, write protect it both in the parent and the child */
                                if (cow && pte_write(pte)) {
-                                       /* XEN modification: modified ordering here to avoid RaW hazard. */
-                                       pte = *src_pte;
-                                       pte = pte_wrprotect(pte);
                                        ptep_set_wrprotect(src_pte);
+                                       pte = *src_pte;
                                }
 
                                /* If it's a shared mapping, mark it clean in the child */
@@ -914,7 +911,6 @@ static inline void establish_pte(struct vm_area_struct * vma, unsigned long addr
 {
 #ifdef CONFIG_XEN
        if ( likely(vma->vm_mm == current->mm) ) {
-               XEN_flush_page_update_queue();
                HYPERVISOR_update_va_mapping(address, entry, UVMF_INVLPG);
        } else {
                set_pte(page_table, entry);
@@ -1189,13 +1185,10 @@ static int do_swap_page(struct mm_struct * mm,
        flush_page_to_ram(page);
        flush_icache_page(vma, page);
 #ifdef CONFIG_XEN
-       if ( likely(vma->vm_mm == current->mm) ) {
-               XEN_flush_page_update_queue();
+       if ( likely(vma->vm_mm == current->mm) )
                HYPERVISOR_update_va_mapping(address, pte, 0);
-       } else {
+       else
                set_pte(page_table, pte);
-               XEN_flush_page_update_queue();
-       }
 #else
        set_pte(page_table, pte);
 #endif
@@ -1245,13 +1238,10 @@ static int do_anonymous_page(struct mm_struct * mm, struct vm_area_struct * vma,
        }
 
 #ifdef CONFIG_XEN
-       if ( likely(vma->vm_mm == current->mm) ) {
-               XEN_flush_page_update_queue();
+       if ( likely(vma->vm_mm == current->mm) )
                HYPERVISOR_update_va_mapping(addr, entry, 0);
-       } else {
+       else
                set_pte(page_table, entry);
-               XEN_flush_page_update_queue();
-       }
 #else
        set_pte(page_table, entry);
 #endif
@@ -1331,13 +1321,10 @@ static int do_no_page(struct mm_struct * mm, struct vm_area_struct * vma,
                if (write_access)
                        entry = pte_mkwrite(pte_mkdirty(entry));
 #ifdef CONFIG_XEN
-               if ( likely(vma->vm_mm == current->mm) ) {
-                       XEN_flush_page_update_queue();
+               if ( likely(vma->vm_mm == current->mm) )
                        HYPERVISOR_update_va_mapping(address, entry, 0);
-               } else {
+               else
                        set_pte(page_table, entry);
-                       XEN_flush_page_update_queue();
-               }
 #else
                set_pte(page_table, entry);
 #endif
@@ -1484,7 +1471,6 @@ pte_t fastcall *pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long addres
                /* "fast" allocation can happen without dropping the lock.. */
                new = pte_alloc_one_fast(mm, address);
                if (!new) {
-                       XEN_flush_page_update_queue();
                        spin_unlock(&mm->page_table_lock);
                        new = pte_alloc_one(mm, address);
                        spin_lock(&mm->page_table_lock);
index 330e194baed1bc972c0e31bb2bf3daf4d7a62337..475c308b1b6bbecb9af4161bd84ccbdec58f5dc4 100644 (file)
@@ -119,11 +119,9 @@ static int move_page_tables(struct mm_struct * mm,
         * the old page tables)
         */
 oops_we_failed:
-       XEN_flush_page_update_queue();
        flush_cache_range(mm, new_addr, new_addr + len);
        while ((offset += PAGE_SIZE) < len)
                move_one_page(mm, new_addr + offset, old_addr + offset);
-       XEN_flush_page_update_queue();
        zap_page_range(mm, new_addr, len);
        return -1;
 }
diff --git a/linux-2.4.29-xen-sparse/mm/swapfile.c b/linux-2.4.29-xen-sparse/mm/swapfile.c
deleted file mode 100644 (file)
index 6457f19..0000000
+++ /dev/null
@@ -1,1267 +0,0 @@
-/*
- *  linux/mm/swapfile.c
- *
- *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
- *  Swap reorganised 29.12.95, Stephen Tweedie
- */
-
-#include <linux/slab.h>
-#include <linux/smp_lock.h>
-#include <linux/kernel_stat.h>
-#include <linux/swap.h>
-#include <linux/swapctl.h>
-#include <linux/blkdev.h> /* for blk_size */
-#include <linux/vmalloc.h>
-#include <linux/pagemap.h>
-#include <linux/shm.h>
-
-#include <asm/pgtable.h>
-
-spinlock_t swaplock = SPIN_LOCK_UNLOCKED;
-unsigned int nr_swapfiles;
-int total_swap_pages;
-static int swap_overflow;
-
-static const char Bad_file[] = "Bad swap file entry ";
-static const char Unused_file[] = "Unused swap file entry ";
-static const char Bad_offset[] = "Bad swap offset entry ";
-static const char Unused_offset[] = "Unused swap offset entry ";
-
-struct swap_list_t swap_list = {-1, -1};
-
-struct swap_info_struct swap_info[MAX_SWAPFILES];
-
-#define SWAPFILE_CLUSTER 256
-
-static inline int scan_swap_map(struct swap_info_struct *si)
-{
-       unsigned long offset;
-       /* 
-        * We try to cluster swap pages by allocating them
-        * sequentially in swap.  Once we've allocated
-        * SWAPFILE_CLUSTER pages this way, however, we resort to
-        * first-free allocation, starting a new cluster.  This
-        * prevents us from scattering swap pages all over the entire
-        * swap partition, so that we reduce overall disk seek times
-        * between swap pages.  -- sct */
-       if (si->cluster_nr) {
-               while (si->cluster_next <= si->highest_bit) {
-                       offset = si->cluster_next++;
-                       if (si->swap_map[offset])
-                               continue;
-                       si->cluster_nr--;
-                       goto got_page;
-               }
-       }
-       si->cluster_nr = SWAPFILE_CLUSTER;
-
-       /* try to find an empty (even not aligned) cluster. */
-       offset = si->lowest_bit;
- check_next_cluster:
-       if (offset+SWAPFILE_CLUSTER-1 <= si->highest_bit)
-       {
-               int nr;
-               for (nr = offset; nr < offset+SWAPFILE_CLUSTER; nr++)
-                       if (si->swap_map[nr])
-                       {
-                               offset = nr+1;
-                               goto check_next_cluster;
-                       }
-               /* We found a completly empty cluster, so start
-                * using it.
-                */
-               goto got_page;
-       }
-       /* No luck, so now go finegrined as usual. -Andrea */
-       for (offset = si->lowest_bit; offset <= si->highest_bit ; offset++) {
-               if (si->swap_map[offset])
-                       continue;
-               si->lowest_bit = offset+1;
-       got_page:
-               if (offset == si->lowest_bit)
-                       si->lowest_bit++;
-               if (offset == si->highest_bit)
-                       si->highest_bit--;
-               if (si->lowest_bit > si->highest_bit) {
-                       si->lowest_bit = si->max;
-                       si->highest_bit = 0;
-               }
-               si->swap_map[offset] = 1;
-               nr_swap_pages--;
-               si->cluster_next = offset+1;
-               return offset;
-       }
-       si->lowest_bit = si->max;
-       si->highest_bit = 0;
-       return 0;
-}
-
-swp_entry_t get_swap_page(void)
-{
-       struct swap_info_struct * p;
-       unsigned long offset;
-       swp_entry_t entry;
-       int type, wrapped = 0;
-
-       entry.val = 0;  /* Out of memory */
-       swap_list_lock();
-       type = swap_list.next;
-       if (type < 0)
-               goto out;
-       if (nr_swap_pages <= 0)
-               goto out;
-
-       while (1) {
-               p = &swap_info[type];
-               if ((p->flags & SWP_WRITEOK) == SWP_WRITEOK) {
-                       swap_device_lock(p);
-                       offset = scan_swap_map(p);
-                       swap_device_unlock(p);
-                       if (offset) {
-                               entry = SWP_ENTRY(type,offset);
-                               type = swap_info[type].next;
-                               if (type < 0 ||
-                                       p->prio != swap_info[type].prio) {
-                                               swap_list.next = swap_list.head;
-                               } else {
-                                       swap_list.next = type;
-                               }
-                               goto out;
-                       }
-               }
-               type = p->next;
-               if (!wrapped) {
-                       if (type < 0 || p->prio != swap_info[type].prio) {
-                               type = swap_list.head;
-                               wrapped = 1;
-                       }
-               } else
-                       if (type < 0)
-                               goto out;       /* out of swap space */
-       }
-out:
-       swap_list_unlock();
-       return entry;
-}
-
-static struct swap_info_struct * swap_info_get(swp_entry_t entry)
-{
-       struct swap_info_struct * p;
-       unsigned long offset, type;
-
-       if (!entry.val)
-               goto out;
-       type = SWP_TYPE(entry);
-       if (type >= nr_swapfiles)
-               goto bad_nofile;
-       p = & swap_info[type];
-       if (!(p->flags & SWP_USED))
-               goto bad_device;
-       offset = SWP_OFFSET(entry);
-       if (offset >= p->max)
-               goto bad_offset;
-       if (!p->swap_map[offset])
-               goto bad_free;
-       swap_list_lock();
-       if (p->prio > swap_info[swap_list.next].prio)
-               swap_list.next = type;
-       swap_device_lock(p);
-       return p;
-
-bad_free:
-       printk(KERN_ERR "swap_free: %s%08lx\n", Unused_offset, entry.val);
-       goto out;
-bad_offset:
-       printk(KERN_ERR "swap_free: %s%08lx\n", Bad_offset, entry.val);
-       goto out;
-bad_device:
-       printk(KERN_ERR "swap_free: %s%08lx\n", Unused_file, entry.val);
-       goto out;
-bad_nofile:
-       printk(KERN_ERR "swap_free: %s%08lx\n", Bad_file, entry.val);
-out:
-       return NULL;
-}      
-
-static void swap_info_put(struct swap_info_struct * p)
-{
-       swap_device_unlock(p);
-       swap_list_unlock();
-}
-
-static int swap_entry_free(struct swap_info_struct *p, unsigned long offset)
-{
-       int count = p->swap_map[offset];
-
-       if (count < SWAP_MAP_MAX) {
-               count--;
-               p->swap_map[offset] = count;
-               if (!count) {
-                       if (offset < p->lowest_bit)
-                               p->lowest_bit = offset;
-                       if (offset > p->highest_bit)
-                               p->highest_bit = offset;
-                       nr_swap_pages++;
-               }
-       }
-       return count;
-}
-
-/*
- * Caller has made sure that the swapdevice corresponding to entry
- * is still around or has not been recycled.
- */
-void swap_free(swp_entry_t entry)
-{
-       struct swap_info_struct * p;
-
-       p = swap_info_get(entry);
-       if (p) {
-               swap_entry_free(p, SWP_OFFSET(entry));
-               swap_info_put(p);
-       }
-}
-
-/*
- * Check if we're the only user of a swap page,
- * when the page is locked.
- */
-static int exclusive_swap_page(struct page *page)
-{
-       int retval = 0;
-       struct swap_info_struct * p;
-       swp_entry_t entry;
-
-       entry.val = page->index;
-       p = swap_info_get(entry);
-       if (p) {
-               /* Is the only swap cache user the cache itself? */
-               if (p->swap_map[SWP_OFFSET(entry)] == 1) {
-                       /* Recheck the page count with the pagecache lock held.. */
-                       spin_lock(&pagecache_lock);
-                       if (page_count(page) - !!page->buffers == 2)
-                               retval = 1;
-                       spin_unlock(&pagecache_lock);
-               }
-               swap_info_put(p);
-       }
-       return retval;
-}
-
-/*
- * We can use this swap cache entry directly
- * if there are no other references to it.
- *
- * Here "exclusive_swap_page()" does the real
- * work, but we opportunistically check whether
- * we need to get all the locks first..
- */
-int fastcall can_share_swap_page(struct page *page)
-{
-       int retval = 0;
-
-       if (!PageLocked(page))
-               BUG();
-       switch (page_count(page)) {
-       case 3:
-               if (!page->buffers)
-                       break;
-               /* Fallthrough */
-       case 2:
-               if (!PageSwapCache(page))
-                       break;
-               retval = exclusive_swap_page(page);
-               break;
-       case 1:
-               if (PageReserved(page))
-                       break;
-               retval = 1;
-       }
-       return retval;
-}
-
-/*
- * Work out if there are any other processes sharing this
- * swap cache page. Free it if you can. Return success.
- */
-int fastcall remove_exclusive_swap_page(struct page *page)
-{
-       int retval;
-       struct swap_info_struct * p;
-       swp_entry_t entry;
-
-       if (!PageLocked(page))
-               BUG();
-       if (!PageSwapCache(page))
-               return 0;
-       if (page_count(page) - !!page->buffers != 2)    /* 2: us + cache */
-               return 0;
-
-       entry.val = page->index;
-       p = swap_info_get(entry);
-       if (!p)
-               return 0;
-
-       /* Is the only swap cache user the cache itself? */
-       retval = 0;
-       if (p->swap_map[SWP_OFFSET(entry)] == 1) {
-               /* Recheck the page count with the pagecache lock held.. */
-               spin_lock(&pagecache_lock);
-               if (page_count(page) - !!page->buffers == 2) {
-                       __delete_from_swap_cache(page);
-                       SetPageDirty(page);
-                       retval = 1;
-               }
-               spin_unlock(&pagecache_lock);
-       }
-       swap_info_put(p);
-
-       if (retval) {
-               block_flushpage(page, 0);
-               swap_free(entry);
-               page_cache_release(page);
-       }
-
-       return retval;
-}
-
-/*
- * Free the swap entry like above, but also try to
- * free the page cache entry if it is the last user.
- */
-void free_swap_and_cache(swp_entry_t entry)
-{
-       struct swap_info_struct * p;
-       struct page *page = NULL;
-
-       p = swap_info_get(entry);
-       if (p) {
-               if (swap_entry_free(p, SWP_OFFSET(entry)) == 1)
-                       page = find_trylock_page(&swapper_space, entry.val);
-               swap_info_put(p);
-       }
-       if (page) {
-               page_cache_get(page);
-               /* Only cache user (+us), or swap space full? Free it! */
-               if (page_count(page) - !!page->buffers == 2 || vm_swap_full()) {
-                       delete_from_swap_cache(page);
-                       SetPageDirty(page);
-               }
-               UnlockPage(page);
-               page_cache_release(page);
-       }
-}
-
-/*
- * The swap entry has been read in advance, and we return 1 to indicate
- * that the page has been used or is no longer needed.
- *
- * Always set the resulting pte to be nowrite (the same as COW pages
- * after one process has exited).  We don't know just how many PTEs will
- * share this swap entry, so be cautious and let do_wp_page work out
- * what to do if a write is requested later.
- */
-/* mmlist_lock and vma->vm_mm->page_table_lock are held */
-static inline void unuse_pte(struct vm_area_struct * vma, unsigned long address,
-       pte_t *dir, swp_entry_t entry, struct page* page)
-{
-       pte_t pte = *dir;
-
-       if (likely(pte_to_swp_entry(pte).val != entry.val))
-               return;
-       if (unlikely(pte_none(pte) || pte_present(pte)))
-               return;
-       get_page(page);
-       set_pte(dir, pte_mkold(mk_pte(page, vma->vm_page_prot)));
-       swap_free(entry);
-       ++vma->vm_mm->rss;
-}
-
-/* mmlist_lock and vma->vm_mm->page_table_lock are held */
-static inline void unuse_pmd(struct vm_area_struct * vma, pmd_t *dir,
-       unsigned long address, unsigned long size, unsigned long offset,
-       swp_entry_t entry, struct page* page)
-{
-       pte_t * pte;
-       unsigned long end;
-
-       if (pmd_none(*dir))
-               return;
-       if (pmd_bad(*dir)) {
-               pmd_ERROR(*dir);
-               pmd_clear(dir);
-               return;
-       }
-       pte = pte_offset(dir, address);
-       offset += address & PMD_MASK;
-       address &= ~PMD_MASK;
-       end = address + size;
-       if (end > PMD_SIZE)
-               end = PMD_SIZE;
-       do {
-               unuse_pte(vma, offset+address-vma->vm_start, pte, entry, page);
-               address += PAGE_SIZE;
-               pte++;
-       } while (address && (address < end));
-}
-
-/* mmlist_lock and vma->vm_mm->page_table_lock are held */
-static inline void unuse_pgd(struct vm_area_struct * vma, pgd_t *dir,
-       unsigned long address, unsigned long size,
-       swp_entry_t entry, struct page* page)
-{
-       pmd_t * pmd;
-       unsigned long offset, end;
-
-       if (pgd_none(*dir))
-               return;
-       if (pgd_bad(*dir)) {
-               pgd_ERROR(*dir);
-               pgd_clear(dir);
-               return;
-       }
-       pmd = pmd_offset(dir, address);
-       offset = address & PGDIR_MASK;
-       address &= ~PGDIR_MASK;
-       end = address + size;
-       if (end > PGDIR_SIZE)
-               end = PGDIR_SIZE;
-       if (address >= end)
-               BUG();
-       do {
-               unuse_pmd(vma, pmd, address, end - address, offset, entry,
-                         page);
-               address = (address + PMD_SIZE) & PMD_MASK;
-               pmd++;
-       } while (address && (address < end));
-}
-
-/* mmlist_lock and vma->vm_mm->page_table_lock are held */
-static void unuse_vma(struct vm_area_struct * vma, pgd_t *pgdir,
-                       swp_entry_t entry, struct page* page)
-{
-       unsigned long start = vma->vm_start, end = vma->vm_end;
-
-       if (start >= end)
-               BUG();
-       do {
-               unuse_pgd(vma, pgdir, start, end - start, entry, page);
-               start = (start + PGDIR_SIZE) & PGDIR_MASK;
-               pgdir++;
-       } while (start && (start < end));
-}
-
-static void unuse_process(struct mm_struct * mm,
-                       swp_entry_t entry, struct page* page)
-{
-       struct vm_area_struct* vma;
-
-       /*
-        * Go through process' page directory.
-        */
-       spin_lock(&mm->page_table_lock);
-       for (vma = mm->mmap; vma; vma = vma->vm_next) {
-               pgd_t * pgd = pgd_offset(mm, vma->vm_start);
-               unuse_vma(vma, pgd, entry, page);
-       }
-       XEN_flush_page_update_queue();
-       spin_unlock(&mm->page_table_lock);
-       return;
-}
-
-/*
- * Scan swap_map from current position to next entry still in use.
- * Recycle to start on reaching the end, returning 0 when empty.
- */
-static int find_next_to_unuse(struct swap_info_struct *si, int prev)
-{
-       int max = si->max;
-       int i = prev;
-       int count;
-
-       /*
-        * No need for swap_device_lock(si) here: we're just looking
-        * for whether an entry is in use, not modifying it; false
-        * hits are okay, and sys_swapoff() has already prevented new
-        * allocations from this area (while holding swap_list_lock()).
-        */
-       for (;;) {
-               if (++i >= max) {
-                       if (!prev) {
-                               i = 0;
-                               break;
-                       }
-                       /*
-                        * No entries in use at top of swap_map,
-                        * loop back to start and recheck there.
-                        */
-                       max = prev + 1;
-                       prev = 0;
-                       i = 1;
-               }
-               count = si->swap_map[i];
-               if (count && count != SWAP_MAP_BAD)
-                       break;
-       }
-       return i;
-}
-
-/*
- * We completely avoid races by reading each swap page in advance,
- * and then search for the process using it.  All the necessary
- * page table adjustments can then be made atomically.
- */
-static int try_to_unuse(unsigned int type)
-{
-       struct swap_info_struct * si = &swap_info[type];
-       struct mm_struct *start_mm;
-       unsigned short *swap_map;
-       unsigned short swcount;
-       struct page *page;
-       swp_entry_t entry;
-       int i = 0;
-       int retval = 0;
-       int reset_overflow = 0;
-       int shmem;
-
-       /*
-        * When searching mms for an entry, a good strategy is to
-        * start at the first mm we freed the previous entry from
-        * (though actually we don't notice whether we or coincidence
-        * freed the entry).  Initialize this start_mm with a hold.
-        *
-        * A simpler strategy would be to start at the last mm we
-        * freed the previous entry from; but that would take less
-        * advantage of mmlist ordering (now preserved by swap_out()),
-        * which clusters forked address spaces together, most recent
-        * child immediately after parent.  If we race with dup_mmap(),
-        * we very much want to resolve parent before child, otherwise
-        * we may miss some entries: using last mm would invert that.
-        */
-       start_mm = &init_mm;
-       atomic_inc(&init_mm.mm_users);
-
-       /*
-        * Keep on scanning until all entries have gone.  Usually,
-        * one pass through swap_map is enough, but not necessarily:
-        * mmput() removes mm from mmlist before exit_mmap() and its
-        * zap_page_range().  That's not too bad, those entries are
-        * on their way out, and handled faster there than here.
-        * do_munmap() behaves similarly, taking the range out of mm's
-        * vma list before zap_page_range().  But unfortunately, when
-        * unmapping a part of a vma, it takes the whole out first,
-        * then reinserts what's left after (might even reschedule if
-        * open() method called) - so swap entries may be invisible
-        * to swapoff for a while, then reappear - but that is rare.
-        */
-       while ((i = find_next_to_unuse(si, i))) {
-               /* 
-                * Get a page for the entry, using the existing swap
-                * cache page if there is one.  Otherwise, get a clean
-                * page and read the swap into it. 
-                */
-               swap_map = &si->swap_map[i];
-               entry = SWP_ENTRY(type, i);
-               page = read_swap_cache_async(entry);
-               if (!page) {
-                       /*
-                        * Either swap_duplicate() failed because entry
-                        * has been freed independently, and will not be
-                        * reused since sys_swapoff() already disabled
-                        * allocation from here, or alloc_page() failed.
-                        */
-                       if (!*swap_map)
-                               continue;
-                       retval = -ENOMEM;
-                       break;
-               }
-
-               /*
-                * Don't hold on to start_mm if it looks like exiting.
-                */
-               if (atomic_read(&start_mm->mm_users) == 1) {
-                       mmput(start_mm);
-                       start_mm = &init_mm;
-                       atomic_inc(&init_mm.mm_users);
-               }
-
-               /*
-                * Wait for and lock page.  When do_swap_page races with
-                * try_to_unuse, do_swap_page can handle the fault much
-                * faster than try_to_unuse can locate the entry.  This
-                * apparently redundant "wait_on_page" lets try_to_unuse
-                * defer to do_swap_page in such a case - in some tests,
-                * do_swap_page and try_to_unuse repeatedly compete.
-                */
-               wait_on_page(page);
-               lock_page(page);
-
-               /*
-                * Remove all references to entry, without blocking.
-                * Whenever we reach init_mm, there's no address space
-                * to search, but use it as a reminder to search shmem.
-                */
-               shmem = 0;
-               swcount = *swap_map;
-               if (swcount > 1) {
-                       flush_page_to_ram(page);
-                       if (start_mm == &init_mm)
-                               shmem = shmem_unuse(entry, page);
-                       else
-                               unuse_process(start_mm, entry, page);
-               }
-               if (*swap_map > 1) {
-                       int set_start_mm = (*swap_map >= swcount);
-                       struct list_head *p = &start_mm->mmlist;
-                       struct mm_struct *new_start_mm = start_mm;
-                       struct mm_struct *mm;
-
-                       spin_lock(&mmlist_lock);
-                       while (*swap_map > 1 &&
-                                       (p = p->next) != &start_mm->mmlist) {
-                               mm = list_entry(p, struct mm_struct, mmlist);
-                               swcount = *swap_map;
-                               if (mm == &init_mm) {
-                                       set_start_mm = 1;
-                                       spin_unlock(&mmlist_lock);
-                                       shmem = shmem_unuse(entry, page);
-                                       spin_lock(&mmlist_lock);
-                               } else
-                                       unuse_process(mm, entry, page);
-                               if (set_start_mm && *swap_map < swcount) {
-                                       new_start_mm = mm;
-                                       set_start_mm = 0;
-                               }
-                       }
-                       atomic_inc(&new_start_mm->mm_users);
-                       spin_unlock(&mmlist_lock);
-                       mmput(start_mm);
-                       start_mm = new_start_mm;
-               }
-
-               /*
-                * How could swap count reach 0x7fff when the maximum
-                * pid is 0x7fff, and there's no way to repeat a swap
-                * page within an mm (except in shmem, where it's the
-                * shared object which takes the reference count)?
-                * We believe SWAP_MAP_MAX cannot occur in Linux 2.4.
-                *
-                * If that's wrong, then we should worry more about
-                * exit_mmap() and do_munmap() cases described above:
-                * we might be resetting SWAP_MAP_MAX too early here.
-                * We know "Undead"s can happen, they're okay, so don't
-                * report them; but do report if we reset SWAP_MAP_MAX.
-                */
-               if (*swap_map == SWAP_MAP_MAX) {
-                       swap_list_lock();
-                       swap_device_lock(si);
-                       nr_swap_pages++;
-                       *swap_map = 1;
-                       swap_device_unlock(si);
-                       swap_list_unlock();
-                       reset_overflow = 1;
-               }
-
-               /*
-                * If a reference remains (rare), we would like to leave
-                * the page in the swap cache; but try_to_swap_out could
-                * then re-duplicate the entry once we drop page lock,
-                * so we might loop indefinitely; also, that page could
-                * not be swapped out to other storage meanwhile.  So:
-                * delete from cache even if there's another reference,
-                * after ensuring that the data has been saved to disk -
-                * since if the reference remains (rarer), it will be
-                * read from disk into another page.  Splitting into two
-                * pages would be incorrect if swap supported "shared
-                * private" pages, but they are handled by tmpfs files.
-                *
-                * Note shmem_unuse already deleted swappage from cache,
-                * unless corresponding filepage found already in cache:
-                * in which case it left swappage in cache, lowered its
-                * swap count to pass quickly through the loops above,
-                * and now we must reincrement count to try again later.
-                */
-               if ((*swap_map > 1) && PageDirty(page) && PageSwapCache(page)) {
-                       rw_swap_page(WRITE, page);
-                       lock_page(page);
-               }
-               if (PageSwapCache(page)) {
-                       if (shmem)
-                               swap_duplicate(entry);
-                       else
-                               delete_from_swap_cache(page);
-               }
-
-               /*
-                * So we could skip searching mms once swap count went
-                * to 1, we did not mark any present ptes as dirty: must
-                * mark page dirty so try_to_swap_out will preserve it.
-                */
-               SetPageDirty(page);
-               UnlockPage(page);
-               page_cache_release(page);
-
-               /*
-                * Make sure that we aren't completely killing
-                * interactive performance.  Interruptible check on
-                * signal_pending() would be nice, but changes the spec?
-                */
-               if (current->need_resched)
-                       schedule();
-       }
-
-       mmput(start_mm);
-       if (reset_overflow) {
-               printk(KERN_WARNING "swapoff: cleared swap entry overflow\n");
-               swap_overflow = 0;
-       }
-       return retval;
-}
-
-asmlinkage long sys_swapoff(const char * specialfile)
-{
-       struct swap_info_struct * p = NULL;
-       unsigned short *swap_map;
-       struct nameidata nd;
-       int i, type, prev;
-       int err;
-       
-       if (!capable(CAP_SYS_ADMIN))
-               return -EPERM;
-
-       err = user_path_walk(specialfile, &nd);
-       if (err)
-               goto out;
-
-       lock_kernel();
-       prev = -1;
-       swap_list_lock();
-       for (type = swap_list.head; type >= 0; type = swap_info[type].next) {
-               p = swap_info + type;
-               if ((p->flags & SWP_WRITEOK) == SWP_WRITEOK) {
-                       if (p->swap_file == nd.dentry)
-                         break;
-               }
-               prev = type;
-       }
-       err = -EINVAL;
-       if (type < 0) {
-               swap_list_unlock();
-               goto out_dput;
-       }
-
-       if (prev < 0) {
-               swap_list.head = p->next;
-       } else {
-               swap_info[prev].next = p->next;
-       }
-       if (type == swap_list.next) {
-               /* just pick something that's safe... */
-               swap_list.next = swap_list.head;
-       }
-       nr_swap_pages -= p->pages;
-       total_swap_pages -= p->pages;
-       p->flags = SWP_USED;
-       swap_list_unlock();
-       unlock_kernel();
-       err = try_to_unuse(type);
-       lock_kernel();
-       if (err) {
-               /* re-insert swap space back into swap_list */
-               swap_list_lock();
-               for (prev = -1, i = swap_list.head; i >= 0; prev = i, i = swap_info[i].next)
-                       if (p->prio >= swap_info[i].prio)
-                               break;
-               p->next = i;
-               if (prev < 0)
-                       swap_list.head = swap_list.next = p - swap_info;
-               else
-                       swap_info[prev].next = p - swap_info;
-               nr_swap_pages += p->pages;
-               total_swap_pages += p->pages;
-               p->flags = SWP_WRITEOK;
-               swap_list_unlock();
-               goto out_dput;
-       }
-       if (p->swap_device)
-               blkdev_put(p->swap_file->d_inode->i_bdev, BDEV_SWAP);
-       path_release(&nd);
-
-       swap_list_lock();
-       swap_device_lock(p);
-       nd.mnt = p->swap_vfsmnt;
-       nd.dentry = p->swap_file;
-       p->swap_vfsmnt = NULL;
-       p->swap_file = NULL;
-       p->swap_device = 0;
-       p->max = 0;
-       swap_map = p->swap_map;
-       p->swap_map = NULL;
-       p->flags = 0;
-       swap_device_unlock(p);
-       swap_list_unlock();
-       vfree(swap_map);
-       err = 0;
-
-out_dput:
-       unlock_kernel();
-       path_release(&nd);
-out:
-       return err;
-}
-
-int get_swaparea_info(char *buf)
-{
-       char * page = (char *) __get_free_page(GFP_KERNEL);
-       struct swap_info_struct *ptr = swap_info;
-       int i, j, len = 0, usedswap;
-
-       if (!page)
-               return -ENOMEM;
-
-       len += sprintf(buf, "Filename\t\t\tType\t\tSize\tUsed\tPriority\n");
-       for (i = 0 ; i < nr_swapfiles ; i++, ptr++) {
-               if ((ptr->flags & SWP_USED) && ptr->swap_map) {
-                       char * path = d_path(ptr->swap_file, ptr->swap_vfsmnt,
-                                               page, PAGE_SIZE);
-
-                       len += sprintf(buf + len, "%-31s ", path);
-
-                       if (!ptr->swap_device)
-                               len += sprintf(buf + len, "file\t\t");
-                       else
-                               len += sprintf(buf + len, "partition\t");
-
-                       usedswap = 0;
-                       for (j = 0; j < ptr->max; ++j)
-                               switch (ptr->swap_map[j]) {
-                                       case SWAP_MAP_BAD:
-                                       case 0:
-                                               continue;
-                                       default:
-                                               usedswap++;
-                               }
-                       len += sprintf(buf + len, "%d\t%d\t%d\n", ptr->pages << (PAGE_SHIFT - 10), 
-                               usedswap << (PAGE_SHIFT - 10), ptr->prio);
-               }
-       }
-       free_page((unsigned long) page);
-       return len;
-}
-
-int is_swap_partition(kdev_t dev) {
-       struct swap_info_struct *ptr = swap_info;
-       int i;
-
-       for (i = 0 ; i < nr_swapfiles ; i++, ptr++) {
-               if (ptr->flags & SWP_USED)
-                       if (ptr->swap_device == dev)
-                               return 1;
-       }
-       return 0;
-}
-
-/*
- * Written 01/25/92 by Simmule Turner, heavily changed by Linus.
- *
- * The swapon system call
- */
-asmlinkage long sys_swapon(const char * specialfile, int swap_flags)
-{
-       struct swap_info_struct * p;
-       struct nameidata nd;
-       struct inode * swap_inode;
-       unsigned int type;
-       int i, j, prev;
-       int error;
-       static int least_priority = 0;
-       union swap_header *swap_header = 0;
-       int swap_header_version;
-       int nr_good_pages = 0;
-       unsigned long maxpages = 1;
-       int swapfilesize;
-       struct block_device *bdev = NULL;
-       unsigned short *swap_map;
-       
-       if (!capable(CAP_SYS_ADMIN))
-               return -EPERM;
-       lock_kernel();
-       swap_list_lock();
-       p = swap_info;
-       for (type = 0 ; type < nr_swapfiles ; type++,p++)
-               if (!(p->flags & SWP_USED))
-                       break;
-       error = -EPERM;
-       if (type >= MAX_SWAPFILES) {
-               swap_list_unlock();
-               goto out;
-       }
-       if (type >= nr_swapfiles)
-               nr_swapfiles = type+1;
-       p->flags = SWP_USED;
-       p->swap_file = NULL;
-       p->swap_vfsmnt = NULL;
-       p->swap_device = 0;
-       p->swap_map = NULL;
-       p->lowest_bit = 0;
-       p->highest_bit = 0;
-       p->cluster_nr = 0;
-       p->sdev_lock = SPIN_LOCK_UNLOCKED;
-       p->next = -1;
-       if (swap_flags & SWAP_FLAG_PREFER) {
-               p->prio =
-                 (swap_flags & SWAP_FLAG_PRIO_MASK)>>SWAP_FLAG_PRIO_SHIFT;
-       } else {
-               p->prio = --least_priority;
-       }
-       swap_list_unlock();
-       error = user_path_walk(specialfile, &nd);
-       if (error)
-               goto bad_swap_2;
-
-       p->swap_file = nd.dentry;
-       p->swap_vfsmnt = nd.mnt;
-       swap_inode = nd.dentry->d_inode;
-       error = -EINVAL;
-
-       if (S_ISBLK(swap_inode->i_mode)) {
-               kdev_t dev = swap_inode->i_rdev;
-               struct block_device_operations *bdops;
-               devfs_handle_t de;
-
-               if (is_mounted(dev)) {
-                       error = -EBUSY;
-                       goto bad_swap_2;
-               }
-
-               p->swap_device = dev;
-               set_blocksize(dev, PAGE_SIZE);
-               
-               bd_acquire(swap_inode);
-               bdev = swap_inode->i_bdev;
-               de = devfs_get_handle_from_inode(swap_inode);
-               bdops = devfs_get_ops(de);  /*  Increments module use count  */
-               if (bdops) bdev->bd_op = bdops;
-
-               error = blkdev_get(bdev, FMODE_READ|FMODE_WRITE, 0, BDEV_SWAP);
-               devfs_put_ops(de);/*Decrement module use count now we're safe*/
-               if (error)
-                       goto bad_swap_2;
-               set_blocksize(dev, PAGE_SIZE);
-               error = -ENODEV;
-               if (!dev || (blk_size[MAJOR(dev)] &&
-                    !blk_size[MAJOR(dev)][MINOR(dev)]))
-                       goto bad_swap;
-               swapfilesize = 0;
-               if (blk_size[MAJOR(dev)])
-                       swapfilesize = blk_size[MAJOR(dev)][MINOR(dev)]
-                               >> (PAGE_SHIFT - 10);
-       } else if (S_ISREG(swap_inode->i_mode))
-               swapfilesize = swap_inode->i_size >> PAGE_SHIFT;
-       else
-               goto bad_swap;
-
-       error = -EBUSY;
-       for (i = 0 ; i < nr_swapfiles ; i++) {
-               struct swap_info_struct *q = &swap_info[i];
-               if (i == type || !q->swap_file)
-                       continue;
-               if (swap_inode->i_mapping == q->swap_file->d_inode->i_mapping)
-                       goto bad_swap;
-       }
-
-       swap_header = (void *) __get_free_page(GFP_USER);
-       if (!swap_header) {
-               printk("Unable to start swapping: out of memory :-)\n");
-               error = -ENOMEM;
-               goto bad_swap;
-       }
-
-       lock_page(virt_to_page(swap_header));
-       rw_swap_page_nolock(READ, SWP_ENTRY(type,0), (char *) swap_header);
-
-       if (!memcmp("SWAP-SPACE",swap_header->magic.magic,10))
-               swap_header_version = 1;
-       else if (!memcmp("SWAPSPACE2",swap_header->magic.magic,10))
-               swap_header_version = 2;
-       else {
-               printk("Unable to find swap-space signature\n");
-               error = -EINVAL;
-               goto bad_swap;
-       }
-       
-       switch (swap_header_version) {
-       case 1:
-               memset(((char *) swap_header)+PAGE_SIZE-10,0,10);
-               j = 0;
-               p->lowest_bit = 0;
-               p->highest_bit = 0;
-               for (i = 1 ; i < 8*PAGE_SIZE ; i++) {
-                       if (test_bit(i,(char *) swap_header)) {
-                               if (!p->lowest_bit)
-                                       p->lowest_bit = i;
-                               p->highest_bit = i;
-                               maxpages = i+1;
-                               j++;
-                       }
-               }
-               nr_good_pages = j;
-               p->swap_map = vmalloc(maxpages * sizeof(short));
-               if (!p->swap_map) {
-                       error = -ENOMEM;                
-                       goto bad_swap;
-               }
-               for (i = 1 ; i < maxpages ; i++) {
-                       if (test_bit(i,(char *) swap_header))
-                               p->swap_map[i] = 0;
-                       else
-                               p->swap_map[i] = SWAP_MAP_BAD;
-               }
-               break;
-
-       case 2:
-               /* Check the swap header's sub-version and the size of
-                   the swap file and bad block lists */
-               if (swap_header->info.version != 1) {
-                       printk(KERN_WARNING
-                              "Unable to handle swap header version %d\n",
-                              swap_header->info.version);
-                       error = -EINVAL;
-                       goto bad_swap;
-               }
-
-               p->lowest_bit  = 1;
-               maxpages = SWP_OFFSET(SWP_ENTRY(0,~0UL)) - 1;
-               if (maxpages > swap_header->info.last_page)
-                       maxpages = swap_header->info.last_page;
-               p->highest_bit = maxpages - 1;
-
-               error = -EINVAL;
-               if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES)
-                       goto bad_swap;
-               
-               /* OK, set up the swap map and apply the bad block list */
-               if (!(p->swap_map = vmalloc(maxpages * sizeof(short)))) {
-                       error = -ENOMEM;
-                       goto bad_swap;
-               }
-
-               error = 0;
-               memset(p->swap_map, 0, maxpages * sizeof(short));
-               for (i=0; i<swap_header->info.nr_badpages; i++) {
-                       int page = swap_header->info.badpages[i];
-                       if (page <= 0 || page >= swap_header->info.last_page)
-                               error = -EINVAL;
-                       else
-                               p->swap_map[page] = SWAP_MAP_BAD;
-               }
-               nr_good_pages = swap_header->info.last_page -
-                               swap_header->info.nr_badpages -
-                               1 /* header page */;
-               if (error) 
-                       goto bad_swap;
-       }
-       
-       if (swapfilesize && maxpages > swapfilesize) {
-               printk(KERN_WARNING
-                      "Swap area shorter than signature indicates\n");
-               error = -EINVAL;
-               goto bad_swap;
-       }
-       if (!nr_good_pages) {
-               printk(KERN_WARNING "Empty swap-file\n");
-               error = -EINVAL;
-               goto bad_swap;
-       }
-       p->swap_map[0] = SWAP_MAP_BAD;
-       swap_list_lock();
-       swap_device_lock(p);
-       p->max = maxpages;
-       p->flags = SWP_WRITEOK;
-       p->pages = nr_good_pages;
-       nr_swap_pages += nr_good_pages;
-       total_swap_pages += nr_good_pages;
-       printk(KERN_INFO "Adding Swap: %dk swap-space (priority %d)\n",
-              nr_good_pages<<(PAGE_SHIFT-10), p->prio);
-
-       /* insert swap space into swap_list: */
-       prev = -1;
-       for (i = swap_list.head; i >= 0; i = swap_info[i].next) {
-               if (p->prio >= swap_info[i].prio) {
-                       break;
-               }
-               prev = i;
-       }
-       p->next = i;
-       if (prev < 0) {
-               swap_list.head = swap_list.next = p - swap_info;
-       } else {
-               swap_info[prev].next = p - swap_info;
-       }
-       swap_device_unlock(p);
-       swap_list_unlock();
-       error = 0;
-       goto out;
-bad_swap:
-       if (bdev)
-               blkdev_put(bdev, BDEV_SWAP);
-bad_swap_2:
-       swap_list_lock();
-       swap_map = p->swap_map;
-       nd.mnt = p->swap_vfsmnt;
-       nd.dentry = p->swap_file;
-       p->swap_device = 0;
-       p->swap_file = NULL;
-       p->swap_vfsmnt = NULL;
-       p->swap_map = NULL;
-       p->flags = 0;
-       if (!(swap_flags & SWAP_FLAG_PREFER))
-               ++least_priority;
-       swap_list_unlock();
-       if (swap_map)
-               vfree(swap_map);
-       path_release(&nd);
-out:
-       if (swap_header)
-               free_page((long) swap_header);
-       unlock_kernel();
-       return error;
-}
-
-void si_swapinfo(struct sysinfo *val)
-{
-       unsigned int i;
-       unsigned long nr_to_be_unused = 0;
-
-       swap_list_lock();
-       for (i = 0; i < nr_swapfiles; i++) {
-               unsigned int j;
-               if (swap_info[i].flags != SWP_USED)
-                       continue;
-               for (j = 0; j < swap_info[i].max; ++j) {
-                       switch (swap_info[i].swap_map[j]) {
-                               case 0:
-                               case SWAP_MAP_BAD:
-                                       continue;
-                               default:
-                                       nr_to_be_unused++;
-                       }
-               }
-       }
-       val->freeswap = nr_swap_pages + nr_to_be_unused;
-       val->totalswap = total_swap_pages + nr_to_be_unused;
-       swap_list_unlock();
-}
-
-/*
- * Verify that a swap entry is valid and increment its swap map count.
- *
- * Note: if swap_map[] reaches SWAP_MAP_MAX the entries are treated as
- * "permanent", but will be reclaimed by the next swapoff.
- */
-int swap_duplicate(swp_entry_t entry)
-{
-       struct swap_info_struct * p;
-       unsigned long offset, type;
-       int result = 0;
-
-       type = SWP_TYPE(entry);
-       if (type >= nr_swapfiles)
-               goto bad_file;
-       p = type + swap_info;
-       offset = SWP_OFFSET(entry);
-
-       swap_device_lock(p);
-       if (offset < p->max && p->swap_map[offset]) {
-               if (p->swap_map[offset] < SWAP_MAP_MAX - 1) {
-                       p->swap_map[offset]++;
-                       result = 1;
-               } else if (p->swap_map[offset] <= SWAP_MAP_MAX) {
-                       if (swap_overflow++ < 5)
-                               printk(KERN_WARNING "swap_dup: swap entry overflow\n");
-                       p->swap_map[offset] = SWAP_MAP_MAX;
-                       result = 1;
-               }
-       }
-       swap_device_unlock(p);
-out:
-       return result;
-
-bad_file:
-       printk(KERN_ERR "swap_dup: %s%08lx\n", Bad_file, entry.val);
-       goto out;
-}
-
-/*
- * Prior swap_duplicate protects against swap device deletion.
- */
-void get_swaphandle_info(swp_entry_t entry, unsigned long *offset, 
-                       kdev_t *dev, struct inode **swapf)
-{
-       unsigned long type;
-       struct swap_info_struct *p;
-
-       type = SWP_TYPE(entry);
-       if (type >= nr_swapfiles) {
-               printk(KERN_ERR "rw_swap_page: %s%08lx\n", Bad_file, entry.val);
-               return;
-       }
-
-       p = &swap_info[type];
-       *offset = SWP_OFFSET(entry);
-       if (*offset >= p->max && *offset != 0) {
-               printk(KERN_ERR "rw_swap_page: %s%08lx\n", Bad_offset, entry.val);
-               return;
-       }
-       if (p->swap_map && !p->swap_map[*offset]) {
-               printk(KERN_ERR "rw_swap_page: %s%08lx\n", Unused_offset, entry.val);
-               return;
-       }
-       if (!(p->flags & SWP_USED)) {
-               printk(KERN_ERR "rw_swap_page: %s%08lx\n", Unused_file, entry.val);
-               return;
-       }
-
-       if (p->swap_device) {
-               *dev = p->swap_device;
-       } else if (p->swap_file) {
-               *swapf = p->swap_file->d_inode;
-       } else {
-               printk(KERN_ERR "rw_swap_page: no swap file or device\n");
-       }
-       return;
-}
-
-/*
- * swap_device_lock prevents swap_map being freed. Don't grab an extra
- * reference on the swaphandle, it doesn't matter if it becomes unused.
- */
-int valid_swaphandles(swp_entry_t entry, unsigned long *offset)
-{
-       int ret = 0, i = 1 << page_cluster;
-       unsigned long toff;
-       struct swap_info_struct *swapdev = SWP_TYPE(entry) + swap_info;
-
-       if (!page_cluster)      /* no readahead */
-               return 0;
-       toff = (SWP_OFFSET(entry) >> page_cluster) << page_cluster;
-       if (!toff)              /* first page is swap header */
-               toff++, i--;
-       *offset = toff;
-
-       swap_device_lock(swapdev);
-       do {
-               /* Don't read-ahead past the end of the swap area */
-               if (toff >= swapdev->max)
-                       break;
-               /* Don't read in free or bad pages */
-               if (!swapdev->swap_map[toff])
-                       break;
-               if (swapdev->swap_map[toff] == SWAP_MAP_BAD)
-                       break;
-               toff++;
-               ret++;
-       } while (--i);
-       swap_device_unlock(swapdev);
-       return ret;
-}
diff --git a/linux-2.4.29-xen-sparse/mm/vmalloc.c b/linux-2.4.29-xen-sparse/mm/vmalloc.c
deleted file mode 100644 (file)
index df02fcb..0000000
+++ /dev/null
@@ -1,385 +0,0 @@
-/*
- *  linux/mm/vmalloc.c
- *
- *  Copyright (C) 1993  Linus Torvalds
- *  Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
- *  SMP-safe vmalloc/vfree/ioremap, Tigran Aivazian <tigran@veritas.com>, May 2000
- */
-
-#include <linux/config.h>
-#include <linux/slab.h>
-#include <linux/vmalloc.h>
-#include <linux/spinlock.h>
-#include <linux/highmem.h>
-#include <linux/smp_lock.h>
-
-#include <asm/uaccess.h>
-#include <asm/pgalloc.h>
-
-rwlock_t vmlist_lock = RW_LOCK_UNLOCKED;
-struct vm_struct * vmlist;
-
-static inline void free_area_pte(pmd_t * pmd, unsigned long address, unsigned long size)
-{
-       pte_t * pte;
-       unsigned long end;
-
-       if (pmd_none(*pmd))
-               return;
-       if (pmd_bad(*pmd)) {
-               pmd_ERROR(*pmd);
-               pmd_clear(pmd);
-               return;
-       }
-       pte = pte_offset(pmd, address);
-       address &= ~PMD_MASK;
-       end = address + size;
-       if (end > PMD_SIZE)
-               end = PMD_SIZE;
-       do {
-               pte_t page;
-               page = ptep_get_and_clear(pte);
-               address += PAGE_SIZE;
-               pte++;
-               if (pte_none(page))
-                       continue;
-               if (pte_present(page)) {
-                       struct page *ptpage = pte_page(page);
-                       if (VALID_PAGE(ptpage) && (!PageReserved(ptpage)))
-                               __free_page(ptpage);
-                       continue;
-               }
-               printk(KERN_CRIT "Whee.. Swapped out page in kernel page table\n");
-       } while (address < end);
-}
-
-static inline void free_area_pmd(pgd_t * dir, unsigned long address, unsigned long size)
-{
-       pmd_t * pmd;
-       unsigned long end;
-
-       if (pgd_none(*dir))
-               return;
-       if (pgd_bad(*dir)) {
-               pgd_ERROR(*dir);
-               pgd_clear(dir);
-               return;
-       }
-       pmd = pmd_offset(dir, address);
-       address &= ~PGDIR_MASK;
-       end = address + size;
-       if (end > PGDIR_SIZE)
-               end = PGDIR_SIZE;
-       do {
-               free_area_pte(pmd, address, end - address);
-               address = (address + PMD_SIZE) & PMD_MASK;
-               pmd++;
-       } while (address < end);
-}
-
-void vmfree_area_pages(unsigned long address, unsigned long size)
-{
-       pgd_t * dir;
-       unsigned long end = address + size;
-
-       dir = pgd_offset_k(address);
-       flush_cache_all();
-       do {
-               free_area_pmd(dir, address, end - address);
-               address = (address + PGDIR_SIZE) & PGDIR_MASK;
-               dir++;
-       } while (address && (address < end));
-       flush_tlb_all();
-}
-
-static inline int alloc_area_pte (pte_t * pte, unsigned long address,
-                       unsigned long size, int gfp_mask,
-                       pgprot_t prot, struct page ***pages)
-{
-       unsigned long end;
-
-       address &= ~PMD_MASK;
-       end = address + size;
-       if (end > PMD_SIZE)
-               end = PMD_SIZE;
-       do {
-               struct page * page;
-
-               if (!pages) {
-                       spin_unlock(&init_mm.page_table_lock);
-                       page = alloc_page(gfp_mask);
-                       spin_lock(&init_mm.page_table_lock);
-               } else {
-                       page = (**pages);
-                       (*pages)++;
-
-                       /* Add a reference to the page so we can free later */
-                       if (page)
-                               atomic_inc(&page->count);
-
-               }
-               if (!pte_none(*pte))
-                       printk(KERN_ERR "alloc_area_pte: page already exists\n");
-               if (!page)
-                       return -ENOMEM;
-               set_pte(pte, mk_pte(page, prot));
-               address += PAGE_SIZE;
-               pte++;
-       } while (address < end);
-       return 0;
-}
-
-static inline int alloc_area_pmd(pmd_t * pmd, unsigned long address,
-                       unsigned long size, int gfp_mask,
-                       pgprot_t prot, struct page ***pages)
-{
-       unsigned long end;
-
-       address &= ~PGDIR_MASK;
-       end = address + size;
-       if (end > PGDIR_SIZE)
-               end = PGDIR_SIZE;
-       do {
-               pte_t * pte = pte_alloc(&init_mm, pmd, address);
-               if (!pte)
-                       return -ENOMEM;
-               if (alloc_area_pte(pte, address, end - address,
-                                       gfp_mask, prot, pages))
-                       return -ENOMEM;
-               address = (address + PMD_SIZE) & PMD_MASK;
-               pmd++;
-       } while (address < end);
-       return 0;
-}
-
-/*static inline*/ int __vmalloc_area_pages (unsigned long address,
-                                       unsigned long size,
-                                       int gfp_mask,
-                                       pgprot_t prot,
-                                       struct page ***pages)
-{
-       pgd_t * dir;
-       unsigned long start = address;
-       unsigned long end = address + size;
-
-       dir = pgd_offset_k(address);
-       spin_lock(&init_mm.page_table_lock);
-       do {
-               pmd_t *pmd;
-               
-               pmd = pmd_alloc(&init_mm, dir, address);
-               if (!pmd)
-                       goto err;
-
-               if (alloc_area_pmd(pmd, address, end - address, gfp_mask, prot, pages))
-                       goto err;       // The kernel NEVER reclaims pmds, so no need to undo pmd_alloc() here
-
-               address = (address + PGDIR_SIZE) & PGDIR_MASK;
-               dir++;
-       } while (address && (address < end));
-       spin_unlock(&init_mm.page_table_lock);
-       flush_cache_all();
-       XEN_flush_page_update_queue();
-       return 0;
-err:
-       spin_unlock(&init_mm.page_table_lock);
-       flush_cache_all();
-       if (address > start)
-               vmfree_area_pages(start, address - start);
-       return -ENOMEM;
-}
-
-int vmalloc_area_pages(unsigned long address, unsigned long size,
-                      int gfp_mask, pgprot_t prot)
-{
-       return __vmalloc_area_pages(address, size, gfp_mask, prot, NULL);
-}
-
-struct vm_struct * get_vm_area(unsigned long size, unsigned long flags)
-{
-       unsigned long addr, next;
-       struct vm_struct **p, *tmp, *area;
-
-       area = (struct vm_struct *) kmalloc(sizeof(*area), GFP_KERNEL);
-       if (!area)
-               return NULL;
-
-       size += PAGE_SIZE;
-       if (!size) {
-               kfree (area);
-               return NULL;
-       }
-
-       addr = VMALLOC_START;
-       write_lock(&vmlist_lock);
-       for (p = &vmlist; (tmp = *p) ; p = &tmp->next) {
-               if ((size + addr) < addr)
-                       goto out;
-               if (size + addr <= (unsigned long) tmp->addr)
-                       break;
-               next = tmp->size + (unsigned long) tmp->addr;
-               if (next > addr) 
-                       addr = next;
-               if (addr > VMALLOC_END-size)
-                       goto out;
-       }
-       area->flags = flags;
-       area->addr = (void *)addr;
-       area->size = size;
-       area->next = *p;
-       *p = area;
-       write_unlock(&vmlist_lock);
-       return area;
-
-out:
-       write_unlock(&vmlist_lock);
-       kfree(area);
-       return NULL;
-}
-
-void __vfree(void * addr, int free_area_pages)
-{
-       struct vm_struct **p, *tmp;
-
-       if (!addr)
-               return;
-       if ((PAGE_SIZE-1) & (unsigned long) addr) {
-               printk(KERN_ERR "Trying to vfree() bad address (%p)\n", addr);
-               return;
-       }
-       write_lock(&vmlist_lock);
-       for (p = &vmlist ; (tmp = *p) ; p = &tmp->next) {
-               if (tmp->addr == addr) {
-                       *p = tmp->next;
-                       if (free_area_pages)
-                               vmfree_area_pages(VMALLOC_VMADDR(tmp->addr), tmp->size);
-                       write_unlock(&vmlist_lock);
-                       kfree(tmp);
-                       return;
-               }
-       }
-       write_unlock(&vmlist_lock);
-       printk(KERN_ERR "Trying to vfree() nonexistent vm area (%p)\n", addr);
-}
-
-void vfree(void * addr)
-{
-       __vfree(addr,1);
-}
-
-void * __vmalloc (unsigned long size, int gfp_mask, pgprot_t prot)
-{
-       void * addr;
-       struct vm_struct *area;
-
-       size = PAGE_ALIGN(size);
-       if (!size || (size >> PAGE_SHIFT) > num_physpages)
-               return NULL;
-       area = get_vm_area(size, VM_ALLOC);
-       if (!area)
-               return NULL;
-       addr = area->addr;
-       if (__vmalloc_area_pages(VMALLOC_VMADDR(addr), size, gfp_mask,
-                                prot, NULL)) {
-               __vfree(addr, 0);
-               return NULL;
-       }
-       return addr;
-}
-
-void * vmap(struct page **pages, int count,
-           unsigned long flags, pgprot_t prot)
-{
-       void * addr;
-       struct vm_struct *area;
-       unsigned long size = count << PAGE_SHIFT;
-
-       if (!size || size > (max_mapnr << PAGE_SHIFT))
-               return NULL;
-       area = get_vm_area(size, flags);
-       if (!area) {
-               return NULL;
-       }
-       addr = area->addr;
-       if (__vmalloc_area_pages(VMALLOC_VMADDR(addr), size, 0,
-                                prot, &pages)) {
-               __vfree(addr, 0);
-               return NULL;
-       }
-       return addr;
-}
-
-long vread(char *buf, char *addr, unsigned long count)
-{
-       struct vm_struct *tmp;
-       char *vaddr, *buf_start = buf;
-       unsigned long n;
-
-       /* Don't allow overflow */
-       if ((unsigned long) addr + count < count)
-               count = -(unsigned long) addr;
-
-       read_lock(&vmlist_lock);
-       for (tmp = vmlist; tmp; tmp = tmp->next) {
-               vaddr = (char *) tmp->addr;
-               if (addr >= vaddr + tmp->size - PAGE_SIZE)
-                       continue;
-               while (addr < vaddr) {
-                       if (count == 0)
-                               goto finished;
-                       *buf = '\0';
-                       buf++;
-                       addr++;
-                       count--;
-               }
-               n = vaddr + tmp->size - PAGE_SIZE - addr;
-               do {
-                       if (count == 0)
-                               goto finished;
-                       *buf = *addr;
-                       buf++;
-                       addr++;
-                       count--;
-               } while (--n > 0);
-       }
-finished:
-       read_unlock(&vmlist_lock);
-       return buf - buf_start;
-}
-
-long vwrite(char *buf, char *addr, unsigned long count)
-{
-       struct vm_struct *tmp;
-       char *vaddr, *buf_start = buf;
-       unsigned long n;
-
-       /* Don't allow overflow */
-       if ((unsigned long) addr + count < count)
-               count = -(unsigned long) addr;
-
-       read_lock(&vmlist_lock);
-       for (tmp = vmlist; tmp; tmp = tmp->next) {
-               vaddr = (char *) tmp->addr;
-               if (addr >= vaddr + tmp->size - PAGE_SIZE)
-                       continue;
-               while (addr < vaddr) {
-                       if (count == 0)
-                               goto finished;
-                       buf++;
-                       addr++;
-                       count--;
-               }
-               n = vaddr + tmp->size - PAGE_SIZE - addr;
-               do {
-                       if (count == 0)
-                               goto finished;
-                       *addr = *buf;
-                       buf++;
-                       addr++;
-                       count--;
-               } while (--n > 0);
-       }
-finished:
-       read_unlock(&vmlist_lock);
-       return buf - buf_start;
-}
index 2a8c5f200f97530c01bfe11426e4dee0c49344f2..1c2ba9b4a2b0fa2369b78b228472addd62063ab3 100644 (file)
@@ -114,10 +114,6 @@ config XEN_BLKDEV_TAP
          to a character device, allowing device prototyping in application
          space.  Odds are that you want to say N here.
 
-config XEN_WRITABLE_PAGETABLES
-       bool
-       default y
-
 config XEN_SCRUB_PAGES
        bool "Scrub memory before freeing it to Xen"
        default y
index e906f9852142bd6dad0b27459bb266b6c7bd3ed7..a781740c94722e81f208e3a92b97cd3376c5e153 100644 (file)
@@ -19,7 +19,6 @@ CONFIG_XEN_BLKDEV_FRONTEND=y
 CONFIG_XEN_NETDEV_FRONTEND=y
 # CONFIG_XEN_NETDEV_FRONTEND_PIPELINED_TRANSMITTER is not set
 # CONFIG_XEN_BLKDEV_TAP is not set
-CONFIG_XEN_WRITABLE_PAGETABLES=y
 CONFIG_XEN_SCRUB_PAGES=y
 CONFIG_X86=y
 # CONFIG_X86_64 is not set
index 95dee5b159838a68b1bcbaa6dc08e5db67cea501..b1fc951a81f426b1e45be50ede3cfcc93c77986e 100644 (file)
@@ -16,7 +16,6 @@ CONFIG_XEN_BLKDEV_FRONTEND=y
 CONFIG_XEN_NETDEV_FRONTEND=y
 # CONFIG_XEN_NETDEV_FRONTEND_PIPELINED_TRANSMITTER is not set
 # CONFIG_XEN_BLKDEV_TAP is not set
-CONFIG_XEN_WRITABLE_PAGETABLES=y
 CONFIG_XEN_SCRUB_PAGES=y
 CONFIG_X86=y
 # CONFIG_X86_64 is not set
index b8829c8cdc00ae6fa1881588691051b27e3af997..b7c29174fc37671222bd4fe27f850a366baac076 100644 (file)
@@ -963,7 +963,7 @@ void __init trap_init(void)
         * and a callgate to lcall27 for Solaris/x86 binaries
         */
        make_lowmem_page_readonly(&default_ldt[0]);
-       xen_flush_page_update_queue();
+       flush_page_update_queue();
 
        /*
         * Should be a barrier for any external CPU state.
index 7a0b091ca3c4beff1b493358a2fe251158e90bf6..0cac0f30c37f7a70412103e5a4e0d8619f48843b 100644 (file)
@@ -553,7 +553,6 @@ vmalloc_fault:
                if (!pmd_present(*pmd_k))
                        goto no_context;
                set_pmd(pmd, *pmd_k);
-               xen_flush_page_update_queue(); /* flush PMD update */
 
                pte_k = pte_offset_kernel(pmd_k, address);
                if (!pte_present(*pte_k))
index 62427b2301954d367f1225226c91050d0398dfd6..368179d56015b63b7a7ac93146b588f1a7f21977 100644 (file)
  */
 static spinlock_t update_lock = SPIN_LOCK_UNLOCKED;
 
-/* Linux 2.6 isn't using the traditional batched interface. */
+#define QUEUE_SIZE 1 /*128*/
 #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0)
-#define QUEUE_SIZE 2048
 #define pte_offset_kernel pte_offset
-#define pmd_val_ma(v) (v).pmd;
 #define pud_t pgd_t
 #define pud_offset(d, va) d
 #else
-#ifdef CONFIG_SMP
-#define QUEUE_SIZE 1
-#else
-#define QUEUE_SIZE 128
-#endif
 #define pmd_val_ma(v) (v).pud.pgd.pgd;
 #endif
 
index 6fe3f08632115a49ae19a58823464d0658828e99..2682ac5b9056f426968c9a347fb1c87bc8fcdbb3 100644 (file)
@@ -195,7 +195,7 @@ pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address)
        pte_t *pte = (pte_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO);
        if (pte) {
                make_page_readonly(pte);
-               xen_flush_page_update_queue();
+               flush_page_update_queue();
        }
        return pte;
 }
index f69db851a4fc77b310c6df7ef01f550fc7110e66..36c934fc5da475aaa4cfd73ae510483bc08c70b5 100644 (file)
@@ -109,10 +109,8 @@ static void __do_suspend(void)
 
     HYPERVISOR_vm_assist(VMASST_CMD_enable,
                         VMASST_TYPE_4gb_segments);
-#ifdef CONFIG_XEN_WRITABLE_PAGETABLES
     HYPERVISOR_vm_assist(VMASST_CMD_enable,
                         VMASST_TYPE_writable_pagetables);
-#endif
 
     shutting_down = -1; 
 
index 345b8264b80c936fc7a747a97641e5215f85d884..1379b4969479a0e49b27be640cf6a841e1e5c5ac 100644 (file)
@@ -111,7 +111,7 @@ typedef struct { unsigned long pgprot; } pgprot_t;
 static inline unsigned long pgd_val(pgd_t x)
 {
        unsigned long ret = x.pgd;
-       if (ret) ret = machine_to_phys(ret);
+       if (ret) ret = machine_to_phys(ret) | 1;
        return ret;
 }
 #define pgprot_val(x)  ((x).pgprot)
index d932c6c17f3be9c2012d2fd92e0bc5b8afb94cd4..dfc5b1e155ad0781a1ca439e161e1561f5e7504f 100644 (file)
@@ -407,7 +407,6 @@ extern void noexec_setup(const char *str);
        do {                                                              \
                if (__dirty) {                                            \
                        if ( likely((__vma)->vm_mm == current->mm) ) {    \
-                           xen_flush_page_update_queue();                \
                            HYPERVISOR_update_va_mapping((__address), (__entry), UVMF_INVLPG); \
                        } else {                                          \
                             xen_l1_entry_update((__ptep), (__entry).pte_low); \
@@ -426,7 +425,6 @@ do {                                                                        \
 #define ptep_establish_new(__vma, __address, __ptep, __entry)          \
 do {                                                                   \
        if (likely((__vma)->vm_mm == current->mm)) {                    \
-               xen_flush_page_update_queue();                          \
                HYPERVISOR_update_va_mapping((__address),               \
                                             __entry, 0);               \
        } else {                                                        \
index 4d77312f6e03279e7efed0036ffd435a1fe722f5..568e84bc2fe6167d4809dda07fe7c9bf1921b8c9 100644 (file)
@@ -117,8 +117,6 @@ void _flush_page_update_queue(void);
     if (per_cpu(mmu_update_queue_idx, smp_processor_id()))     \
        _flush_page_update_queue();                             \
 } while (0)
-#define xen_flush_page_update_queue() (_flush_page_update_queue())
-#define XEN_flush_page_update_queue() (_flush_page_update_queue())
 void MULTICALL_flush_page_update_queue(void);
 
 #ifdef CONFIG_XEN_PHYSDEV_ACCESS